fix: improve handling of disallowed formats (#429)
* fix: Fixes and tests for StopIteration on .convert() Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Remove unnecessary case handling Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Other test fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * improve handling of unsupported types - Introduced new explicit exception types instead of `RuntimeError` - Introduced new `ConversionStatus` value for unsupported formats - Tidied up converter member typing & removed asserts Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * robustify & simplify format option resolution Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * rename new status, populate ConversionResult errors Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
2254845da3
commit
34c7c79858
@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
|
||||
FAILURE = auto()
|
||||
SUCCESS = auto()
|
||||
PARTIAL_SUCCESS = auto()
|
||||
SKIPPED = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
USER_INPUT = auto()
|
||||
|
||||
|
||||
class ErrorItem(BaseModel):
|
||||
|
@ -3,7 +3,7 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||
|
||||
import filetype
|
||||
from docling_core.types.doc import (
|
||||
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
||||
return ds_doc
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return set()
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
return super().unload()
|
||||
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
_log.error(
|
||||
f"Input document {obj.name} does not match any allowed format."
|
||||
)
|
||||
continue
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
|
||||
|
@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
@ -85,7 +92,8 @@ class ImageFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
format_to_default_options = {
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
@ -111,6 +119,10 @@ _format_to_default_options = {
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
else:
|
||||
raise RuntimeError(f"No default options configured for {format}")
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
@ -121,36 +133,26 @@ class DocumentConverter:
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = allowed_formats
|
||||
self.format_to_options = format_options
|
||||
|
||||
if self.allowed_formats is None:
|
||||
# if self.format_to_options is not None:
|
||||
# self.allowed_formats = self.format_to_options.keys()
|
||||
# else:
|
||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
||||
|
||||
if self.format_to_options is None:
|
||||
self.format_to_options = _format_to_default_options
|
||||
else:
|
||||
for f in self.allowed_formats:
|
||||
if f not in self.format_to_options.keys():
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
remove_keys = []
|
||||
for f in self.format_to_options.keys():
|
||||
if f not in self.allowed_formats:
|
||||
remove_keys.append(f)
|
||||
|
||||
for f in remove_keys:
|
||||
self.format_to_options.pop(f)
|
||||
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||
)
|
||||
self.format_to_options = {
|
||||
format: (
|
||||
_get_default_option(format=format)
|
||||
if (custom_option := (format_options or {}).get(format)) is None
|
||||
else custom_option
|
||||
)
|
||||
for format in self.allowed_formats
|
||||
}
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
||||
def initialize_pipeline(self, format: InputFormat):
|
||||
"""Initialize the conversion pipeline for the selected format."""
|
||||
self._get_pipeline(doc_format=format)
|
||||
pipeline = self._get_pipeline(doc_format=format)
|
||||
if pipeline is None:
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for format {format}"
|
||||
)
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert(
|
||||
@ -186,22 +188,28 @@ class DocumentConverter:
|
||||
limits=limits,
|
||||
)
|
||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||
|
||||
had_result = False
|
||||
for conv_res in conv_res_iter:
|
||||
had_result = True
|
||||
if raises_on_error and conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||
)
|
||||
else:
|
||||
yield conv_res
|
||||
|
||||
if not had_result and raises_on_error:
|
||||
raise ConversionError(
|
||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
) -> Iterator[ConversionResult]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
start_time = time.monotonic()
|
||||
|
||||
for input_batch in chunkify(
|
||||
@ -223,27 +231,22 @@ class DocumentConverter:
|
||||
):
|
||||
elapsed = time.monotonic() - start_time
|
||||
start_time = time.monotonic()
|
||||
|
||||
if item is not None:
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
else:
|
||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
||||
|
||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
fopt = self.format_to_options.get(doc_format)
|
||||
|
||||
if fopt is None:
|
||||
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
||||
return None
|
||||
else:
|
||||
pipeline_class = fopt.pipeline_cls
|
||||
pipeline_options = fopt.pipeline_options
|
||||
|
||||
assert pipeline_options is not None
|
||||
if pipeline_options is None:
|
||||
return None
|
||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||
if (
|
||||
pipeline_class not in self.initialized_pipelines
|
||||
@ -257,11 +260,26 @@ class DocumentConverter:
|
||||
|
||||
def _process_document(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> Optional[ConversionResult]:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
) -> ConversionResult:
|
||||
|
||||
valid = (
|
||||
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||
)
|
||||
if valid:
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
error_message = f"File format not allowed: {in_doc.file}"
|
||||
if raises_on_error:
|
||||
raise ConversionError(error_message)
|
||||
else:
|
||||
error_item = ErrorItem(
|
||||
component_type=DoclingComponentType.USER_INPUT,
|
||||
module_name="",
|
||||
error_message=error_message,
|
||||
)
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@ -270,26 +288,28 @@ class DocumentConverter:
|
||||
) -> ConversionResult:
|
||||
if in_doc.valid:
|
||||
pipeline = self._get_pipeline(in_doc.format)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
if pipeline is not None:
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for {in_doc.file}."
|
||||
)
|
||||
else:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
# TODO add error log why it failed.
|
||||
|
||||
return conv_res
|
||||
|
6
docling/exceptions.py
Normal file
6
docling/exceptions.py
Normal file
@ -0,0 +1,6 @@
|
||||
class BaseError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class ConversionError(BaseError):
|
||||
pass
|
@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
GENERATE = True
|
||||
GENERATE = False
|
||||
|
||||
|
||||
def get_pdf_path():
|
||||
|
45
tests/test_invalid_input.py
Normal file
45
tests/test_invalid_input.py
Normal file
@ -0,0 +1,45 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||
from docling.document_converter import ConversionError, DocumentConverter
|
||||
|
||||
|
||||
def get_pdf_path():
|
||||
|
||||
pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
|
||||
return pdf_path
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def converter():
|
||||
converter = DocumentConverter()
|
||||
|
||||
return converter
|
||||
|
||||
|
||||
def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
|
||||
result = converter.convert(
|
||||
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
|
||||
)
|
||||
assert result.status == ConversionStatus.SKIPPED
|
||||
|
||||
|
||||
def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
|
||||
with pytest.raises(ConversionError):
|
||||
converter.convert(
|
||||
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
|
||||
raises_on_error=True,
|
||||
)
|
||||
|
||||
|
||||
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
|
||||
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
|
||||
assert result.status == ConversionStatus.FAILURE
|
||||
|
||||
|
||||
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
|
||||
with pytest.raises(ConversionError):
|
||||
converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)
|
Loading…
Reference in New Issue
Block a user