feat: add Docling JSON ingestion (#783)
* feat: add Docling JSON ingestion

  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies

  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* Update docling/backend/json/docling_json_backend.py

  Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
  Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
e9768ae6a5
commit
88a0e66adc
@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC):
|
||||
def supports_pagination(cls) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
|
0
docling/backend/json/__init__.py
Normal file
0
docling/backend/json/__init__.py
Normal file
58
docling/backend/json/docling_json_backend.py
Normal file
58
docling/backend/json/docling_json_backend.py
Normal file
@ -0,0 +1,58 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Backend that ingests a document already serialized as Docling JSON.

    The input is parsed once, at construction time. The outcome of that
    parse (the resulting ``DoclingDocument`` or the exception it raised) is
    kept on the instance and later reported by ``is_valid()`` and
    ``convert()``.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        super().__init__(in_doc, path_or_stream)

        # Holds exactly one of two things: the parsed document on success,
        # or the exception raised while loading. The original exception is
        # preserved so that convert() can raise it later.
        self._doc_or_err = self._get_doc_or_err()

    @override
    def is_valid(self) -> bool:
        """Return True if the input was parsed into a ``DoclingDocument``."""
        outcome = self._doc_or_err
        return isinstance(outcome, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend reports no pagination support."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.JSON_DOCLING}

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Load and validate the JSON payload.

        Returns:
            The validated ``DoclingDocument``, or the exception instance that
            was raised while reading or validating the input. Nothing is
            raised from here; any failure is returned as a value.
        """
        try:
            source = self.path_or_stream
            payload: Union[str, bytes]
            if isinstance(source, BytesIO):
                # In-memory input: take the raw bytes of the buffer.
                payload = source.getvalue()
            elif isinstance(source, Path):
                with open(source, encoding="utf-8") as fh:
                    payload = fh.read()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=payload)
        except Exception as err:
            return err

    @override
    def convert(self) -> DoclingDocument:
        """Return the parsed document, or raise the stored loading error."""
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome
|
@ -41,6 +41,7 @@ class InputFormat(str, Enum):
|
||||
MD = "md"
|
||||
XLSX = "xlsx"
|
||||
XML_USPTO = "xml_uspto"
|
||||
JSON_DOCLING = "json_docling"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@ -62,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.XLSX: ["xlsx"],
|
||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||
InputFormat.JSON_DOCLING: ["json"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
@ -90,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
],
|
||||
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
||||
InputFormat.JSON_DOCLING: ["application/json"],
|
||||
}
|
||||
|
||||
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
||||
|
@ -350,6 +350,8 @@ class _DocumentConversionInput(BaseModel):
|
||||
mime = FormatToMimeType[InputFormat.HTML][0]
|
||||
elif ext in FormatToExtensions[InputFormat.MD]:
|
||||
mime = FormatToMimeType[InputFormat.MD][0]
|
||||
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
|
||||
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
|
||||
return mime
|
||||
|
||||
@staticmethod
|
||||
|
@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
),
|
||||
InputFormat.JSON_DOCLING: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||
),
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
|
58
tests/test_backend_docling_json.py
Normal file
58
tests/test_backend_docling_json.py
Normal file
@ -0,0 +1,58 @@
|
||||
"""Test methods in module docling.backend.json.docling_json_backend.py."""
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DoclingDocument, InputDocument
|
||||
|
||||
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")
|
||||
|
||||
|
||||
def test_convert_valid_docling_json():
    """Check that a valid Docling JSON file converts to the expected document."""
    backend_cls = DoclingJSONBackend
    source = GT_PATH
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=source)
    assert backend.is_valid()

    # Document produced by the backend under test.
    actual = backend.convert().export_to_dict()

    # Reference: the same file loaded directly through DoclingDocument.
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected
|
||||
|
||||
|
||||
def test_invalid_docling_json():
    """Check that an invalid payload is flagged and fails on conversion."""
    backend_cls = DoclingJSONBackend
    source = BytesIO(b"{}")
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
        filename="foo",
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=source)

    # The backend must report the input as invalid ...
    assert not backend.is_valid()

    # ... and convert() must raise the validation error.
    with pytest.raises(ValidationError):
        backend.convert()
|
@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
|
||||
doc_path.write_text("xyz", encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) == None
|
||||
|
||||
# Valid Docling JSON
|
||||
test_str = '{"name": ""}'
|
||||
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
|
||||
doc_path = temp_dir / "test.json"
|
||||
doc_path.write_text(test_str, encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
|
||||
|
||||
# Non-Docling JSON
|
||||
# TODO: Docling JSON is currently the single supported JSON flavor and the pipeline
|
||||
# will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
|
||||
# disambiguation seen as part of https://github.com/DS4SD/docling/issues/802
|
||||
test_str = "{}"
|
||||
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
|
||||
doc_path = temp_dir / "test.json"
|
||||
doc_path.write_text(test_str, encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING
|
||||
|
||||
|
||||
def _make_input_doc(path):
|
||||
in_doc = InputDocument(
|
||||
|
Loading…
Reference in New Issue
Block a user