feat: Add option to define page range (#852)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-01-31 15:23:00 +01:00 · 2025-01-31 15:23:00 +01:00 · 70d68b6164
commit 70d68b6164
parent d727b04ad0
6 changed files with 82 additions and 4 deletions
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -157,6 +157,8 @@ class InputDocument(BaseModel):
                     self.page_count = self._backend.page_count()
                     if not self.page_count <= self.limits.max_num_pages:
                         self.valid = False
+                    elif self.page_count < self.limits.page_range[0]:
+                        self.valid = False

         except (FileNotFoundError, OSError) as e:
             self.valid = False
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -1,13 +1,28 @@
 import sys
 from pathlib import Path
+from typing import Annotated, Tuple

-from pydantic import BaseModel
+from pydantic import BaseModel, PlainValidator
 from pydantic_settings import BaseSettings, SettingsConfigDict


+def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
+    if v[0] < 1 or v[1] < v[0]:
+        raise ValueError(
+            "Invalid page range: start must be ≥ 1 and end must be ≥ start."
+        )
+    return v
+
+
+PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
+
+DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
+
+
 class DocumentLimits(BaseModel):
     max_num_pages: int = sys.maxsize
     max_file_size: int = sys.maxsize
+    page_range: PageRange = DEFAULT_PAGE_RANGE


 class BatchConcurrencySettings(BaseModel):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -1,9 +1,10 @@
 import logging
+import math
 import sys
 import time
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -31,7 +32,12 @@ from docling.datamodel.document import (
     _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import DocumentLimits, settings
+from docling.datamodel.settings import (
+    DEFAULT_PAGE_RANGE,
+    DocumentLimits,
+    PageRange,
+    settings,
+)
 from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
@@ -184,6 +190,7 @@ class DocumentConverter:
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
+        page_range: PageRange = DEFAULT_PAGE_RANGE,
     ) -> ConversionResult:
         all_res = self.convert_all(
             source=[source],
@@ -191,6 +198,7 @@ class DocumentConverter:
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
             headers=headers,
+            page_range=page_range,
         )
         return next(all_res)

@@ -202,10 +210,12 @@ class DocumentConverter:
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
+        page_range: PageRange = DEFAULT_PAGE_RANGE,
     ) -> Iterator[ConversionResult]:
         limits = DocumentLimits(
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            page_range=page_range,
         )
         conv_input = _DocumentConversionInput(
             path_or_stream_iterator=source, limits=limits, headers=headers
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

             for i in range(0, conv_res.input.page_count):
-                conv_res.pages.append(Page(page_no=i))
+                start_page, end_page = conv_res.input.limits.page_range
+                if (start_page - 1) <= i <= (end_page - 1):
+                    conv_res.pages.append(Page(page_no=i))

             try:
                 # Iterate batches of pages (page_batch_size) in the doc
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
+from docling.datamodel.settings import DocumentLimits


 def test_in_doc_from_valid_path():
@@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
     assert doc.valid == False


+def test_in_doc_with_page_range():
+    test_doc_path = Path("./tests/data/2206.01062.pdf")
+    limits = DocumentLimits()
+    limits.page_range = (1, 10)
+
+    doc = InputDocument(
+        path_or_stream=test_doc_path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+        limits=limits,
+    )
+    assert doc.valid == True
+
+    limits.page_range = (9, 9)
+
+    doc = InputDocument(
+        path_or_stream=test_doc_path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+        limits=limits,
+    )
+    assert doc.valid == True
+
+    limits.page_range = (11, 12)
+
+    doc = InputDocument(
+        path_or_stream=test_doc_path,
+        format=InputFormat.PDF,
+        backend=PyPdfiumDocumentBackend,
+        limits=limits,
+    )
+    assert doc.valid == False
+
+
 def test_guess_format(tmp_path):
     """Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
     dci = _DocumentConversionInput(path_or_stream_iterator=[])
--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
         assert doc_result.status == ConversionStatus.SUCCESS


+def test_page_range(test_doc_path):
+    converter = DocumentConverter()
+    doc_result: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))
+
+    assert doc_result.status == ConversionStatus.SUCCESS
+    assert doc_result.input.page_count == 9
+    assert doc_result.document.num_pages() == 1
+
+    doc_result: ConversionResult = converter.convert(
+        test_doc_path, page_range=(10, 10), raises_on_error=False
+    )
+    assert doc_result.status == ConversionStatus.FAILURE
+
+
 def test_ocr_coverage_threshold(test_doc_path):
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = True