feat: Add option to define page range (#852)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-01-31 15:23:00 +01:00 committed by GitHub
parent d727b04ad0
commit 70d68b6164
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 82 additions and 4 deletions

View File

@@ -157,6 +157,8 @@ class InputDocument(BaseModel):
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages: if not self.page_count <= self.limits.max_num_pages:
self.valid = False self.valid = False
elif self.page_count < self.limits.page_range[0]:
self.valid = False
except (FileNotFoundError, OSError) as e: except (FileNotFoundError, OSError) as e:
self.valid = False self.valid = False

View File

@@ -1,13 +1,28 @@
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Annotated, Tuple
from pydantic import BaseModel from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
if v[0] < 1 or v[1] < v[0]:
raise ValueError(
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
)
return v
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
class DocumentLimits(BaseModel): class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize max_file_size: int = sys.maxsize
page_range: PageRange = DEFAULT_PAGE_RANGE
class BatchConcurrencySettings(BaseModel): class BatchConcurrencySettings(BaseModel):

View File

@@ -1,9 +1,10 @@
import logging import logging
import math
import sys import sys
import time import time
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -31,7 +32,12 @@ from docling.datamodel.document import (
_DocumentConversionInput, _DocumentConversionInput,
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docling.datamodel.settings import (
DEFAULT_PAGE_RANGE,
DocumentLimits,
PageRange,
settings,
)
from docling.exceptions import ConversionError from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.simple_pipeline import SimplePipeline
@@ -184,6 +190,7 @@ class DocumentConverter:
raises_on_error: bool = True, raises_on_error: bool = True,
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> ConversionResult: ) -> ConversionResult:
all_res = self.convert_all( all_res = self.convert_all(
source=[source], source=[source],
@@ -191,6 +198,7 @@
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
headers=headers, headers=headers,
page_range=page_range,
) )
return next(all_res) return next(all_res)
@@ -202,10 +210,12 @@
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:
limits = DocumentLimits( limits = DocumentLimits(
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
page_range=page_range,
) )
conv_input = _DocumentConversionInput( conv_input = _DocumentConversionInput(
path_or_stream_iterator=source, limits=limits, headers=headers path_or_stream_iterator=source, limits=limits, headers=headers

View File

@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count): for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i)) start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
try: try:
# Iterate batches of pages (page_batch_size) in the doc # Iterate batches of pages (page_batch_size) in the doc

View File

@@ -4,6 +4,7 @@ from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
@@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False assert doc.valid == False
def test_in_doc_with_page_range():
    """InputDocument validity must honor DocumentLimits.page_range.

    The document is invalid when its page count is below the range start
    (see the InputDocument check: page_count < limits.page_range[0]).
    The test PDF has 9 pages.
    """
    test_doc_path = Path("./tests/data/2206.01062.pdf")

    def _doc_with_range(page_range):
        # Build a fresh InputDocument constrained by the given page range,
        # so each case is independent of the others.
        limits = DocumentLimits()
        limits.page_range = page_range
        return InputDocument(
            path_or_stream=test_doc_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )

    # Range covering the whole 9-page document -> valid.
    assert _doc_with_range((1, 10)).valid
    # Single page equal to the document's last page -> valid.
    assert _doc_with_range((9, 9)).valid
    # Range starting past the last page -> invalid.
    assert not _doc_with_range((11, 12)).valid
def test_guess_format(tmp_path): def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format""" """Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[]) dci = _DocumentConversionInput(path_or_stream_iterator=[])

View File

@@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS
def test_page_range(test_doc_path):
    """Converting with page_range limits processing to the selected pages.

    A range inside the document succeeds and yields only those pages; a
    range starting past the last page makes the conversion fail.
    """
    converter = DocumentConverter()

    # The test document has 9 pages; converting only page 9 succeeds and
    # the output document contains exactly one page.
    result: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))
    assert result.status == ConversionStatus.SUCCESS
    assert result.input.page_count == 9
    assert result.document.num_pages() == 1

    # A range that begins after the final page cannot be satisfied.
    failed: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert failed.status == ConversionStatus.FAILURE
def test_ocr_coverage_threshold(test_doc_path): def test_ocr_coverage_threshold(test_doc_path):
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True