fix: set valid=false for invalid backends (#171)
* fix: set valid=false for invalid backends
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Add test case for InputDocument
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
b8d2286dd1
commit
3496b4838f
@ -143,11 +143,13 @@ class InputDocument(BaseModel):
|
|||||||
self.valid = False
|
self.valid = False
|
||||||
|
|
||||||
except (FileNotFoundError, OSError) as e:
|
except (FileNotFoundError, OSError) as e:
|
||||||
|
self.valid = False
|
||||||
_log.exception(
|
_log.exception(
|
||||||
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
||||||
)
|
)
|
||||||
# raise
|
# raise
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
|
self.valid = False
|
||||||
_log.exception(
|
_log.exception(
|
||||||
f"An unexpected error occurred while opening the document {self.file.name}",
|
f"An unexpected error occurred while opening the document {self.file.name}",
|
||||||
exc_info=e,
|
exc_info=e,
|
||||||
@ -166,6 +168,8 @@ class InputDocument(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||||
|
if not self._backend.is_valid():
|
||||||
|
self.valid = False
|
||||||
|
|
||||||
|
|
||||||
class DocumentFormat(str, Enum):
|
class DocumentFormat(str, Enum):
|
||||||
|
58
tests/test_input_doc.py
Normal file
58
tests/test_input_doc.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_doc_from_valid_path():
    """An InputDocument built from the sample PDF path is flagged as valid."""
    test_doc_path = Path("./tests/data/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    # Identity check pins the flag to the real boolean rather than any value
    # that merely compares equal to True.
    assert doc.valid is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_doc_from_invalid_path():
    """An InputDocument built from a path expected not to exist is flagged invalid."""
    test_doc_path = Path("./tests/does/not/exist.pdf")

    doc = _make_input_doc(test_doc_path)

    # Identity check: the flag must be the boolean False, not just a value
    # that compares equal to it (e.g. 0).
    assert doc.valid is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_doc_from_valid_buf():
    """An InputDocument built from an in-memory copy of the sample PDF is valid."""
    # read_bytes() opens and closes the file itself, so no file handle is
    # left dangling the way a bare open(...).read() would leave one.
    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_in_doc_from_invalid_buf():
    """An InputDocument built from an empty byte buffer is flagged invalid."""
    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    # Identity check: the flag must be the boolean False, not just a value
    # that compares equal to it (e.g. 0).
    assert doc.valid is False
|
||||||
|
|
||||||
|
|
||||||
|
def _make_input_doc(path):
    """Construct an InputDocument for *path*, declared as PDF, using PyPdfiumDocumentBackend."""
    return InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _make_input_doc_from_stream(doc_stream):
    """Construct an InputDocument from *doc_stream*.

    The stream's ``stream`` attribute is passed as the document source and its
    ``name`` attribute as the filename; the format is declared as PDF and
    PyPdfiumDocumentBackend is used as the backend.
    """
    return InputDocument(
        path_or_stream=doc_stream.stream,
        format=InputFormat.PDF,
        filename=doc_stream.name,
        backend=PyPdfiumDocumentBackend,
    )
|
Loading…
Reference in New Issue
Block a user