feat: add Docling JSON ingestion (#783)
* feat: add Docling JSON ingestion Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * Update docling/backend/json/docling_json_backend.py Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
0
docling/backend/json/__init__.py
Normal file
0
docling/backend/json/__init__.py
Normal file
58
docling/backend/json/docling_json_backend.py
Normal file
58
docling/backend/json/docling_json_backend.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Declarative backend that ingests a JSON-serialized ``DoclingDocument``.

    The JSON payload is parsed once, at construction time. The outcome
    (either the validated document or the exception raised while loading)
    is kept on the instance so that ``is_valid()`` can report it and
    ``convert()`` can return or re-raise it later.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialize the backend and eagerly attempt to load the document.

        Args:
            in_doc: The input document descriptor, forwarded to the base class.
            path_or_stream: Either a filesystem path to the JSON file or an
                in-memory byte stream holding the JSON payload.
        """
        super().__init__(in_doc, path_or_stream)

        # The real loading error must be available for convert() to raise, so
        # a single attribute holds *either* the parsed document *or* the
        # exception that prevented parsing, never both.
        self._doc_or_err = self._get_doc_or_err()

    @override
    def is_valid(self) -> bool:
        """Return True if the JSON payload was loaded into a DoclingDocument."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.JSON_DOCLING}

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Load and validate the document, returning any failure as a value.

        Returns:
            The validated ``DoclingDocument`` on success; otherwise the
            exception instance raised while reading or validating the input.
        """
        try:
            source = self.path_or_stream
            payload: Union[str, bytes]
            if isinstance(source, Path):
                # Text is decoded as UTF-8 regardless of the platform default.
                payload = source.read_text(encoding="utf-8")
            elif isinstance(source, BytesIO):
                # getvalue() yields the full buffer independent of the
                # stream's current read position.
                payload = source.getvalue()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=payload)
        except Exception as exc:
            # Deliberately broad: the error is preserved for convert() to
            # raise rather than being surfaced from the constructor.
            return exc

    @override
    def convert(self) -> DoclingDocument:
        """Return the loaded document, or raise the stored loading error.

        Returns:
            The ``DoclingDocument`` parsed at construction time.

        Raises:
            Exception: Whatever exception was captured while loading the
                JSON payload, re-raised unchanged.
        """
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome
|
||||
Reference in New Issue
Block a user