feat: Add option to define page range (#852)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-01-31 15:23:00 +01:00 committed by GitHub
parent d727b04ad0
commit 70d68b6164
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 82 additions and 4 deletions

View File

@@ -157,6 +157,8 @@ class InputDocument(BaseModel):
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages: if not self.page_count <= self.limits.max_num_pages:
self.valid = False self.valid = False
elif self.page_count < self.limits.page_range[0]:
self.valid = False
except (FileNotFoundError, OSError) as e: except (FileNotFoundError, OSError) as e:
self.valid = False self.valid = False

View File

@@ -1,13 +1,28 @@
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Annotated, Tuple
from pydantic import BaseModel from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
if v[0] < 1 or v[1] < v[0]:
raise ValueError(
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
)
return v
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
class DocumentLimits(BaseModel): class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize max_file_size: int = sys.maxsize
page_range: PageRange = DEFAULT_PAGE_RANGE
class BatchConcurrencySettings(BaseModel): class BatchConcurrencySettings(BaseModel):

View File

@@ -1,9 +1,10 @@
import logging import logging
import math
import sys import sys
import time import time
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -31,7 +32,12 @@ from docling.datamodel.document import (
_DocumentConversionInput, _DocumentConversionInput,
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docling.datamodel.settings import (
DEFAULT_PAGE_RANGE,
DocumentLimits,
PageRange,
settings,
)
from docling.exceptions import ConversionError from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.simple_pipeline import SimplePipeline
@@ -184,6 +190,7 @@ class DocumentConverter:
raises_on_error: bool = True, raises_on_error: bool = True,
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> ConversionResult: ) -> ConversionResult:
all_res = self.convert_all( all_res = self.convert_all(
source=[source], source=[source],
@@ -191,6 +198,7 @@
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
headers=headers, headers=headers,
page_range=page_range,
) )
return next(all_res) return next(all_res)
@@ -202,10 +210,12 @@
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:
limits = DocumentLimits( limits = DocumentLimits(
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
page_range=page_range,
) )
conv_input = _DocumentConversionInput( conv_input = _DocumentConversionInput(
path_or_stream_iterator=source, limits=limits, headers=headers path_or_stream_iterator=source, limits=limits, headers=headers

View File

@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count): for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i)) start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
try: try:
# Iterate batches of pages (page_batch_size) in the doc # Iterate batches of pages (page_batch_size) in the doc

View File

@@ -4,6 +4,7 @@ from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
@@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False assert doc.valid == False
def test_in_doc_with_page_range():
    """InputDocument validity must honor DocumentLimits.page_range.

    The document is invalid when its page count is below the range start
    (see the InputDocument check: page_count < limits.page_range[0]).
    The test PDF has 9 pages.
    """
    test_doc_path = Path("./tests/data/2206.01062.pdf")

    def _doc_with_range(page_range):
        # Build a fresh InputDocument constrained by the given page range,
        # so each case is independent of the others.
        limits = DocumentLimits()
        limits.page_range = page_range
        return InputDocument(
            path_or_stream=test_doc_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )

    # Range covering the whole 9-page document -> valid.
    assert _doc_with_range((1, 10)).valid
    # Single page equal to the document's last page -> valid.
    assert _doc_with_range((9, 9)).valid
    # Range starting past the last page -> invalid.
    assert not _doc_with_range((11, 12)).valid
def test_guess_format(tmp_path): def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format""" """Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[]) dci = _DocumentConversionInput(path_or_stream_iterator=[])

View File

@@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
assert doc_result.status == ConversionStatus.SUCCESS assert doc_result.status == ConversionStatus.SUCCESS
def test_page_range(test_doc_path):
    """Converting with page_range limits processing to the selected pages.

    A range inside the document succeeds and yields only those pages; a
    range starting past the last page makes the conversion fail.
    """
    converter = DocumentConverter()

    # The test document has 9 pages; converting only page 9 succeeds and
    # the output document contains exactly one page.
    result: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))
    assert result.status == ConversionStatus.SUCCESS
    assert result.input.page_count == 9
    assert result.document.num_pages() == 1

    # A range that begins after the final page cannot be satisfied.
    failed: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert failed.status == ConversionStatus.FAILURE
def test_ocr_coverage_threshold(test_doc_path): def test_ocr_coverage_threshold(test_doc_path):
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True pipeline_options.do_ocr = True