feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* repin after more packages on pypi

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-22 13:49:37 +02:00 committed by GitHub
parent f7c50c8b0e
commit a8c6b29a67
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 73 additions and 51 deletions

View File

@ -39,8 +39,9 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(ABC): class PdfDocumentBackend(ABC):
@abstractmethod @abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
pass self.path_or_stream = path_or_stream
self.document_hash = document_hash
@abstractmethod @abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend: def load_page(self, page_no: int) -> PdfPageBackend:
@ -56,4 +57,7 @@ class PdfDocumentBackend(ABC):
@abstractmethod @abstractmethod
def unload(self): def unload(self):
pass if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None

View File

@ -1,6 +1,5 @@
import logging import logging
import random import random
import time
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Iterable, Optional, Union
@ -17,11 +16,14 @@ _log = logging.getLogger(__name__)
class DoclingParsePageBackend(PdfPageBackend): class DoclingParsePageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage, docling_page_obj): def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
):
super().__init__(page_obj) super().__init__(page_obj)
self._ppage = page_obj self._ppage = page_obj
self._dpage = docling_page_obj
self.text_page = None parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = parsed_page["pages"][0]
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
# Find intersecting cells on the page # Find intersecting cells on the page
@ -168,38 +170,39 @@ class DoclingParsePageBackend(PdfPageBackend):
def unload(self): def unload(self):
self._ppage = None self._ppage = None
self._dpage = None self._dpage = None
self.text_page = None
class DoclingParseDocumentBackend(PdfDocumentBackend): class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream) super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call self.parser = pdf_parser()
parser = pdf_parser()
start_pb_time = time.time()
success = False
if isinstance(path_or_stream, BytesIO): if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream) success = self.parser.load_document_from_bytesio(
else: document_hash, path_or_stream
self._parser_doc = parser.find_cells(str(path_or_stream)) )
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
end_pb_time = time.time() - start_pb_time if not success:
_log.info(f"Time to parse with docling-parse: time={end_pb_time:.3f}") raise RuntimeError("docling-parse could not load this document.")
def page_count(self) -> int: def page_count(self) -> int:
return len(self._parser_doc["pages"]) return len(self._pdoc) # To be replaced with docling-parse API
def load_page(self, page_no: int) -> DoclingParsePageBackend: def load_page(self, page_no: int) -> DoclingParsePageBackend:
return DoclingParsePageBackend( return DoclingParsePageBackend(
self._pdoc[page_no], self._parser_doc["pages"][page_no] self.parser, self.document_hash, page_no, self._pdoc[page_no]
) )
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0
def unload(self): def unload(self):
super().unload()
self.parser.unload_document(self.document_hash)
self._pdoc.close() self._pdoc.close()
self._pdoc = None self._pdoc = None
self._parser_doc = None

View File

@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path]): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream) super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream) self._pdoc = pdfium.PdfDocument(path_or_stream)
def page_count(self) -> int: def page_count(self) -> int:
@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
return self.page_count() > 0 return self.page_count() > 0
def unload(self): def unload(self):
super().unload()
self._pdoc.close() self._pdoc.close()
self._pdoc = None self._pdoc = None

View File

@ -79,7 +79,9 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream) self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
elif isinstance(path_or_stream, BytesIO): elif isinstance(path_or_stream, BytesIO):
self.file = PurePath(filename) self.file = PurePath(filename)
@ -89,7 +91,9 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream) self._backend = pdf_backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
if self.document_hash and self._backend.page_count() > 0: if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()

View File

@ -141,6 +141,8 @@ class DocumentConverter:
start_doc_time = time.time() start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc) converted_doc = ConvertedDocument(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid: if not in_doc.valid:
converted_doc.status = ConversionStatus.FAILURE converted_doc.status = ConversionStatus.FAILURE
return converted_doc return converted_doc

View File

@ -1,10 +1,15 @@
import json import json
import logging import logging
import time import time
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -52,7 +57,11 @@ def main():
Path("./test/data/redp5695.pdf"), Path("./test/data/redp5695.pdf"),
] ]
doc_converter = DocumentConverter() # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)

43
poetry.lock generated
View File

@ -822,35 +822,34 @@ tqdm = ">=4.64.0,<5.0.0"
[[package]] [[package]]
name = "docling-parse" name = "docling-parse"
version = "0.2.0" version = "1.0.0"
description = "Simple package to extract text with coordinates from programmatic PDFs" description = "Simple package to extract text with coordinates from programmatic PDFs"
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:3ec6458d36bd33862ae1ca38accbcd2ddc8a881fb5a3ab0aeb9e023bc20d8e04"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:068db83a192b21783cc7bc66e9d3efb9072a57edeb8c07ef1a83a93353efcc36"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:898ee83f1e6f97dd34362948fcc70753fa95c83f77eddf48de5e352db10402f7"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:f57f9bba3ac6a81fc30c34bb08261d7308b0a780d90cbee903821aec2f5fbd88"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:9247e6902f979d23860e4b819b0145a9f55be78b14cf2906ac98f8fb0e9627cd"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ae02643485eb28cb54bac8523243a536751c561dddd86846a8dd9b3804a3c491"},
{file = "docling_parse-0.2.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:ebd0f091bdb106f1c3f72448aedfee52a904cb01e4de73827446e30fc3ac3b54"}, {file = "docling_parse-1.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:01cbb011a337bc4dcdddb281841378af36cbce0898bdf528543c7c54d66e6ecc"},
{file = "docling_parse-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9846bd3347a41337d6e83d7fbfbc636274ed3863ac375f4ca5eac1ea0eb88b8f"}, {file = "docling_parse-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdf142dea82f0a5f5e1bcaa74cc9feeda12899077589e3eb6c728d334b43cdda"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:b71b0f9bfe033f9c872eb8298cd1cf5420b5cad74708ae2008257202fe1218a6"}, {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:8834a8387a55b4082c20da184e7d09f705c17558c465da9a5f35974b19013fe5"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:aa0e840a9007c673f9fededf04e2372b3d1bde7c6360ac7d1b49a78ad58145f8"}, {file = "docling_parse-1.0.0-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:4d1cfe98a7594fac3c7afd8fb08b28e4b1aba8b317e60cc64a85fb19043230b0"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:66e622564073fe5dce4b104b5c80cafea2ae1114efa886ef0bc0f1b1488163a9"}, {file = "docling_parse-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f5da27cd03f1ba8859ebde525db388dd1d862be2712f38a13b6985f95061280c"},
{file = "docling_parse-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:96e5c6b1d4f7df936b2461908e99eb5fe756486d6414de71bced8324f4ce2108"}, {file = "docling_parse-1.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8aa6bdda40483af52591bdff11a578837eb4d6be51c12d44b4e489f520757ae6"},
{file = "docling_parse-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aeaec873f8f3f8549a2511a321cfb3dc9958d9731f538e2c619fba41eea98c5"}, {file = "docling_parse-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5c4b80a8d5e8f832910f32188501a9a6718a0223fb9921ee7cc5cfe62adb857"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:f3e917407a6eb4e71ce4b82ca7aefb9366e750d526011554f9aeae33fdfd53d5"}, {file = "docling_parse-1.0.0-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:c86b263b4b089c3a71cde2a4fb8314614350dd76b3769b0950b371c2964e10d6"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:0e4dde0bcffe59c7e1b9f2146eac2789f6a350571f66de5f4c58e8bf031ad5f6"}, {file = "docling_parse-1.0.0-cp312-cp312-macosx_13_6_x86_64.whl", hash = "sha256:93ef15628d663c036d48d466bf3de7c90a172cf52ba11883990640c758331720"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:12f393a0cba357016e8704e6836e553506b893d5ba16f19e47b0d201c8f6dc6d"}, {file = "docling_parse-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:37218472773ed94b8ed07eeccfa68457f064227759350404fea5f45c311242a7"},
{file = "docling_parse-0.2.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e07f6439fbb53c3898cd24d7d6628dcc514097314eac4832b095291dbd9c23e0"}, {file = "docling_parse-1.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:9f863d9788c62dd34b2cdfd79480785e9a6bb382144b630ceb8b527aaee56351"},
{file = "docling_parse-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cea14f84e196d01f5ae77f59bc9640c488fde9a4eaf25433a7372794ca9433fc"}, {file = "docling_parse-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0358eb13822ce2120362d6e7d63eb80a50d819b5bed5a2ccb7bd9beee4d83a61"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:1d7b7dc072d029869387c2ec8f2d816d066a62d79f18d5c6d037b19b1cda07c6"}, {file = "docling_parse-1.0.0-cp39-cp39-macosx_13_6_arm64.whl", hash = "sha256:5651185fbec4357b7638e1a39a0854a712a0cc74d6644518e64f066ce38ed976"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:acff58ac3ae9c1198956e9dd566949e4ea06c130f9e0050b2a88c7150716fd4f"}, {file = "docling_parse-1.0.0-cp39-cp39-macosx_13_6_x86_64.whl", hash = "sha256:d5efedf361b4c58e372d355c0bb3fa5a20dcd3d002952ccbafb09580a924f426"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:06c688993087b763e7aaa10a8282b2cbe615b6c68540f3538998a6bc85f944f0"}, {file = "docling_parse-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4a67df4699b4ffc2b01e77395ef35843ab23f40ac62bcdf593b6cc1f443eca6"},
{file = "docling_parse-0.2.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:179595753f74d121ad21e4d422e4360a5e54a36c48def130d7d93886807fcdac"},
{file = "docling_parse-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08be7f229bbf4b89d2dba77a80939f6dbdc3a434a26342a6380dc40e25e69fcb"},
] ]
[package.dependencies] [package.dependencies]
cibuildwheel = ">=2.20.0,<3.0.0" cibuildwheel = ">=2.20.0,<3.0.0"
tabulate = ">=0.9.0,<1.0.0"
[[package]] [[package]]
name = "docutils" name = "docutils"
@ -5142,4 +5141,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "4b0af4695af17ce1cdbcd04b4c29360cacd866acc77b5a0529749651ee633323" content-hash = "98d40c4d763018d5aa79b8c0ec00adac2fc06a036a9850b60f8ecce14db7cbcc"

View File

@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3" requests = "^2.32.3"
easyocr = "^1.7" easyocr = "^1.7"
docling-parse = "^0.2.0" docling-parse = "^1.0.0"
certifi = ">=2024.7.4" certifi = ">=2024.7.4"
rtree = "^1.3.0" rtree = "^1.3.0"
scipy = "^1.14.1" scipy = "^1.14.1"