feat: Add option to define page range (#852)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
d727b04ad0
commit
70d68b6164
@ -157,6 +157,8 @@ class InputDocument(BaseModel):
|
|||||||
self.page_count = self._backend.page_count()
|
self.page_count = self._backend.page_count()
|
||||||
if not self.page_count <= self.limits.max_num_pages:
|
if not self.page_count <= self.limits.max_num_pages:
|
||||||
self.valid = False
|
self.valid = False
|
||||||
|
elif self.page_count < self.limits.page_range[0]:
|
||||||
|
self.valid = False
|
||||||
|
|
||||||
except (FileNotFoundError, OSError) as e:
|
except (FileNotFoundError, OSError) as e:
|
||||||
self.valid = False
|
self.valid = False
|
||||||
|
@ -1,13 +1,28 @@
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Annotated, Tuple
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, PlainValidator
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
||||||
|
if v[0] < 1 or v[1] < v[0]:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
||||||
|
)
|
||||||
|
return v
|
||||||
|
|
||||||
|
|
||||||
|
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
||||||
|
|
||||||
|
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
||||||
|
|
||||||
|
|
||||||
class DocumentLimits(BaseModel):
|
class DocumentLimits(BaseModel):
|
||||||
max_num_pages: int = sys.maxsize
|
max_num_pages: int = sys.maxsize
|
||||||
max_file_size: int = sys.maxsize
|
max_file_size: int = sys.maxsize
|
||||||
|
page_range: PageRange = DEFAULT_PAGE_RANGE
|
||||||
|
|
||||||
|
|
||||||
class BatchConcurrencySettings(BaseModel):
|
class BatchConcurrencySettings(BaseModel):
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||||
|
|
||||||
@ -31,7 +32,12 @@ from docling.datamodel.document import (
|
|||||||
_DocumentConversionInput,
|
_DocumentConversionInput,
|
||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import DocumentLimits, settings
|
from docling.datamodel.settings import (
|
||||||
|
DEFAULT_PAGE_RANGE,
|
||||||
|
DocumentLimits,
|
||||||
|
PageRange,
|
||||||
|
settings,
|
||||||
|
)
|
||||||
from docling.exceptions import ConversionError
|
from docling.exceptions import ConversionError
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||||
@ -184,6 +190,7 @@ class DocumentConverter:
|
|||||||
raises_on_error: bool = True,
|
raises_on_error: bool = True,
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
max_file_size: int = sys.maxsize,
|
max_file_size: int = sys.maxsize,
|
||||||
|
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
all_res = self.convert_all(
|
all_res = self.convert_all(
|
||||||
source=[source],
|
source=[source],
|
||||||
@ -191,6 +198,7 @@ class DocumentConverter:
|
|||||||
max_num_pages=max_num_pages,
|
max_num_pages=max_num_pages,
|
||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
|
page_range=page_range,
|
||||||
)
|
)
|
||||||
return next(all_res)
|
return next(all_res)
|
||||||
|
|
||||||
@ -202,10 +210,12 @@ class DocumentConverter:
|
|||||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
max_file_size: int = sys.maxsize,
|
max_file_size: int = sys.maxsize,
|
||||||
|
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||||
) -> Iterator[ConversionResult]:
|
) -> Iterator[ConversionResult]:
|
||||||
limits = DocumentLimits(
|
limits = DocumentLimits(
|
||||||
max_num_pages=max_num_pages,
|
max_num_pages=max_num_pages,
|
||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
|
page_range=page_range,
|
||||||
)
|
)
|
||||||
conv_input = _DocumentConversionInput(
|
conv_input = _DocumentConversionInput(
|
||||||
path_or_stream_iterator=source, limits=limits, headers=headers
|
path_or_stream_iterator=source, limits=limits, headers=headers
|
||||||
|
@ -141,6 +141,8 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||||
|
|
||||||
for i in range(0, conv_res.input.page_count):
|
for i in range(0, conv_res.input.page_count):
|
||||||
|
start_page, end_page = conv_res.input.limits.page_range
|
||||||
|
if (start_page - 1) <= i <= (end_page - 1):
|
||||||
conv_res.pages.append(Page(page_no=i))
|
conv_res.pages.append(Page(page_no=i))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||||
|
from docling.datamodel.settings import DocumentLimits
|
||||||
|
|
||||||
|
|
||||||
def test_in_doc_from_valid_path():
|
def test_in_doc_from_valid_path():
|
||||||
@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
|
|||||||
assert doc.valid == False
|
assert doc.valid == False
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_doc_with_page_range():
    """Check how ``InputDocument.valid`` reacts to different page ranges.

    The same ``DocumentLimits`` instance is reused; only its ``page_range``
    is reassigned before each ``InputDocument`` is built from the test PDF.
    """
    pdf_path = Path("./tests/data/2206.01062.pdf")
    limits = DocumentLimits()

    # (page_range, expected value of doc.valid)
    scenarios = [
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    ]

    for page_range, expected_valid in scenarios:
        limits.page_range = page_range

        doc = InputDocument(
            path_or_stream=pdf_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )
        assert doc.valid == expected_valid
|
||||||
|
|
||||||
|
|
||||||
def test_guess_format(tmp_path):
|
def test_guess_format(tmp_path):
|
||||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||||
|
@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
|
|||||||
assert doc_result.status == ConversionStatus.SUCCESS
|
assert doc_result.status == ConversionStatus.SUCCESS
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_range(test_doc_path):
    """Exercise ``DocumentConverter.convert`` with explicit page ranges.

    With ``page_range=(9, 9)`` the conversion is expected to succeed, report
    an input page count of 9 and produce a one-page document. With
    ``page_range=(10, 10)`` and ``raises_on_error=False`` the result status
    is expected to be ``FAILURE``.
    """
    converter = DocumentConverter()

    in_range: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))

    assert in_range.status == ConversionStatus.SUCCESS
    assert in_range.input.page_count == 9
    assert in_range.document.num_pages() == 1

    # raises_on_error=False so the failure is reported via the status
    # instead of an exception.
    out_of_range: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert out_of_range.status == ConversionStatus.FAILURE
|
||||||
|
|
||||||
|
|
||||||
def test_ocr_coverage_threshold(test_doc_path):
|
def test_ocr_coverage_threshold(test_doc_path):
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.do_ocr = True
|
pipeline_options.do_ocr = True
|
||||||
|
Loading…
Reference in New Issue
Block a user