feat: Add option to define page range (#852)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
d727b04ad0
commit
70d68b6164
@ -157,6 +157,8 @@ class InputDocument(BaseModel):
|
||||
self.page_count = self._backend.page_count()
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
elif self.page_count < self.limits.page_range[0]:
|
||||
self.valid = False
|
||||
|
||||
except (FileNotFoundError, OSError) as e:
|
||||
self.valid = False
|
||||
|
@ -1,13 +1,28 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, PlainValidator
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
||||
if v[0] < 1 or v[1] < v[0]:
|
||||
raise ValueError(
|
||||
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
||||
)
|
||||
return v
|
||||
|
||||
|
||||
# (start, end) page range. Validation is delegated entirely to
# _validate_page_range, which rejects start < 1 and end < start.
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
||||
|
||||
# Default range: page 1 up to sys.maxsize.
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
||||
|
||||
|
||||
# Per-document limits. Every field defaults to a value that imposes no
# practical restriction (sys.maxsize, or the full default page range).
class DocumentLimits(BaseModel):
|
||||
# Largest page count accepted for an input document.
max_num_pages: int = sys.maxsize
|
||||
# Largest accepted file size (NOTE(review): presumably bytes — confirm against the caller).
max_file_size: int = sys.maxsize
|
||||
# (start, end) page range, checked by the PageRange validator.
page_range: PageRange = DEFAULT_PAGE_RANGE
|
||||
|
||||
|
||||
class BatchConcurrencySettings(BaseModel):
|
||||
|
@ -1,9 +1,10 @@
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
import time
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
@ -31,7 +32,12 @@ from docling.datamodel.document import (
|
||||
_DocumentConversionInput,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.datamodel.settings import (
|
||||
DEFAULT_PAGE_RANGE,
|
||||
DocumentLimits,
|
||||
PageRange,
|
||||
settings,
|
||||
)
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
@ -184,6 +190,7 @@ class DocumentConverter:
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||
) -> ConversionResult:
|
||||
all_res = self.convert_all(
|
||||
source=[source],
|
||||
@ -191,6 +198,7 @@ class DocumentConverter:
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
headers=headers,
|
||||
page_range=page_range,
|
||||
)
|
||||
return next(all_res)
|
||||
|
||||
@ -202,10 +210,12 @@ class DocumentConverter:
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||
) -> Iterator[ConversionResult]:
|
||||
limits = DocumentLimits(
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
page_range=page_range,
|
||||
)
|
||||
conv_input = _DocumentConversionInput(
|
||||
path_or_stream_iterator=source, limits=limits, headers=headers
|
||||
|
@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
start_page, end_page = conv_res.input.limits.page_range
|
||||
if (start_page - 1) <= i <= (end_page - 1):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
|
@ -4,6 +4,7 @@ from pathlib import Path
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid == False
|
||||
|
||||
|
||||
def test_in_doc_with_page_range():
    """Check that ``InputDocument.valid`` reflects the configured page range.

    Each case builds a fresh ``DocumentLimits`` through its constructor so the
    ``page_range`` validator runs; assigning to the attribute afterwards skips
    validation unless the model enables ``validate_assignment``.
    """
    test_doc_path = Path("./tests/data/2206.01062.pdf")

    # (page_range, expected value of doc.valid). The last range is expected to
    # start beyond the document's final page, which must mark it invalid.
    cases = [
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    ]

    for page_range, expected_valid in cases:
        doc = InputDocument(
            path_or_stream=test_doc_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=DocumentLimits(page_range=page_range),
        )
        assert doc.valid == expected_valid, f"page_range={page_range}"
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
|
||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||
|
@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
|
||||
assert doc_result.status == ConversionStatus.SUCCESS
|
||||
|
||||
|
||||
def test_page_range(test_doc_path):
    """Convert one in-range page, then request a page past the end of the input."""
    converter = DocumentConverter()

    # Restricting the conversion to page 9 must produce a one-page document,
    # while the input's page_count is still asserted to be 9.
    in_range: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))

    assert in_range.status == ConversionStatus.SUCCESS
    assert in_range.input.page_count == 9
    assert in_range.document.num_pages() == 1

    # Page 10 starts beyond the 9 pages of the input. With
    # raises_on_error=False the problem is reported through the result status
    # rather than as an exception.
    out_of_range: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert out_of_range.status == ConversionStatus.FAILURE
|
||||
|
||||
|
||||
def test_ocr_coverage_threshold(test_doc_path):
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
|
Loading…
Reference in New Issue
Block a user