feat: introducing docling_backend (#26)

Uses our own docling_parse to reliably get PDF cells. To get page images, this backend uses pypdfium2. Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
2024-08-07 16:22:36 +02:00 · 2024-08-07 16:22:36 +02:00 · b8f5e38a8c
commit b8f5e38a8c
parent 62ba4aaf31
4 changed files with 203 additions and 6 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -0,0 +1,171 @@
+import logging
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+
+import pypdfium2 as pdfium
+from docling_parse.docling_parse import pdf_parser
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+
+from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+
+
+class DoclingParsePageBackend(PdfPageBackend):
+    def __init__(self, page_obj: PdfPage, docling_page_obj):
+        """Wrap one PDF page together with its docling-parse result.
+
+        Args:
+            page_obj: pypdfium2 page; used for the page size and rendering.
+            docling_page_obj: Parsed page mapping from docling-parse, read
+                through its "width", "height" and "cells" keys.
+        """
+        super().__init__(page_obj)
+        self._dpage = docling_page_obj
+        self._ppage = page_obj
+        self.text_page = None
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return the text of all parser cells that lie mostly inside ``bbox``.
+
+        Each docling-parse cell box is rescaled from the parser's page
+        dimensions to the pypdfium2 page size and converted to a top-left
+        origin before it is compared with ``bbox``. A cell is included when
+        more than half of its area overlaps ``bbox``.
+
+        Args:
+            bbox: Query rectangle, compared against the top-left-origin
+                cell boxes.
+
+        Returns:
+            The ``rnormalized`` content of the matching cells in parser
+            order, separated by single spaces; an empty string if no cell
+            matches.
+        """
+        text_piece = ""
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for cell in self._dpage["cells"]:
+            x0, y0, x1, y1 = cell["box"]["device"]
+            cell_bbox = BoundingBox(
+                l=x0 * scale * page_size.width / parser_width,
+                b=y0 * scale * page_size.height / parser_height,
+                r=x1 * scale * page_size.width / parser_width,
+                t=y1 * scale * page_size.height / parser_height,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            ).to_top_left_origin(page_size.height * scale)
+
+            cell_area = cell_bbox.area()
+            if cell_area == 0:
+                # A degenerate (zero-width or zero-height) cell has no area to
+                # overlap with and would raise ZeroDivisionError below.
+                continue
+
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell["content"]["rnormalized"]
+
+        return text_piece
+
+    def get_text_cells(self) -> Iterable[Cell]:
+        """Return every docling-parse cell on this page as a ``Cell``.
+
+        Cell boxes are rescaled from the parser's page dimensions to the
+        pypdfium2 page size and converted to a top-left origin. Ids are
+        assigned consecutively from 0 in parser order.
+
+        Returns:
+            A list with one ``Cell`` per entry in the parsed page's "cells".
+        """
+        page_size = self.get_size()
+
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        cells = []
+        # enumerate supplies the running cell id, so no manual counter is needed.
+        for cell_id, parsed_cell in enumerate(self._dpage["cells"]):
+            x0, y0, x1, y1 = parsed_cell["box"]["device"]
+            cells.append(
+                Cell(
+                    id=cell_id,
+                    text=parsed_cell["content"]["rnormalized"],
+                    bbox=BoundingBox(
+                        l=x0 * page_size.width / parser_width,
+                        b=y0 * page_size.height / parser_height,
+                        r=x1 * page_size.width / parser_width,
+                        t=y1 * page_size.height / parser_height,
+                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+
+        def draw_clusters_and_cells():
+            """Debug aid: show the page image with every cell box outlined."""
+            image = self.get_page_image()
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+
+        # Debugging hooks, intentionally disabled:
+        # before merge:
+        # draw_clusters_and_cells()
+
+        # cells = merge_horizontal_cells(cells)
+
+        # after merge:
+        # draw_clusters_and_cells()
+
+        return cells
+
+    def get_page_image(
+        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Render this page, or a cropped region of it, as a PIL image.
+
+        Args:
+            scale: Multiplier applied to the crop box dimensions to obtain
+                the size of the returned image.
+            cropbox: Region to render; when omitted the whole page is used.
+
+        Returns:
+            The rendered image, resized to the scaled crop box dimensions.
+        """
+        page_size = self.get_size()
+
+        if cropbox:
+            # Turn the crop box into per-edge paddings for the renderer: the
+            # right and top values become distances from the page's far edges.
+            # NOTE(review): if to_bottom_left_origin() returns the same object
+            # for a box that is already bottom-left, the two assignments below
+            # also modify the caller's cropbox - confirm it returns a copy.
+            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+        else:
+            # No crop requested: cover the full page and cut nothing off.
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+
+        # Render at 1.5x the requested scale, then resize down to the target
+        # size to make the result sharper.
+        rendered = self._ppage.render(
+            scale=scale * 1.5,
+            rotation=0,  # no additional rotation
+            crop=padbox.as_tuple(),
+        ).to_pil()
+        target_size = (round(cropbox.width * scale), round(cropbox.height * scale))
+
+        return rendered.resize(size=target_size)
+
+    def get_size(self) -> PageSize:
+        """Return the page dimensions as reported by the pypdfium2 page."""
+        page = self._ppage
+        width = page.get_width()
+        height = page.get_height()
+        return PageSize(width=width, height=height)
+
+    def unload(self):
+        """Drop the references to the pypdfium2 page and the parsed page data."""
+        self._ppage = self._dpage = self.text_page = None
+
+
+class DoclingParseDocumentBackend(PdfDocumentBackend):
+    def __init__(self, path_or_stream: Union[BytesIO, Path]):
+        """Open the PDF with pypdfium2 and extract its cells with docling-parse.
+
+        Args:
+            path_or_stream: The PDF to load. It is handed to
+                ``pdfium.PdfDocument`` unchanged and to docling-parse as
+                ``str(path_or_stream)``.
+        """
+        super().__init__(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        # Report progress through logging rather than printing to stdout, so
+        # library users control whether and where the message appears.
+        logging.getLogger(__name__).debug("Parsing with docling-parse")
+        parser = pdf_parser()
+        # NOTE(review): str() of a BytesIO is its repr, not a file path, so
+        # stream input presumably cannot be parsed here - confirm and handle.
+        self._parser_doc = parser.find_cells(str(path_or_stream))
+
+    def page_count(self) -> int:
+        """Return the number of pages found by docling-parse."""
+        pages = self._parser_doc["pages"]
+        return len(pages)
+
+    def load_page(self, page_no: int) -> DoclingParsePageBackend:
+        """Create the page backend for page ``page_no``.
+
+        Args:
+            page_no: Index used for both the pypdfium2 document and the
+                docling-parse page list.
+
+        Returns:
+            A ``DoclingParsePageBackend`` pairing the pypdfium2 page with the
+            docling-parse data stored at the same index.
+        """
+        return DoclingParsePageBackend(
+            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+        )
+
+    def is_valid(self) -> bool:
+        """Return True when the parsed document has at least one page."""
+        num_pages = self.page_count()
+        return num_pages > 0
+
+    def unload(self):
+        """Close the pypdfium2 document and drop the parsed data.
+
+        Safe to call more than once: the document is closed only while a
+        handle to it is still held.
+        """
+        if self._pdoc is not None:
+            self._pdoc.close()
+            self._pdoc = None
+        self._parser_doc = None
--- a/examples/convert.py
+++ b/examples/convert.py
@ -4,7 +4,8 @@ import time
 from pathlib import Path
 from typing import Iterable

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
@ -54,11 +55,12 @@ def main():
    artifacts_path = DocumentConverter.download_models_hf()

    pipeline_options = PipelineOptions(do_table_structure=True)
-    # use text cells predicted from table structure model, instead of matching with pdf cells
-    pipeline_options.table_structure_options.do_cell_matching = False
+    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
-        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        artifacts_path=artifacts_path,
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
    )

    input = DocumentConversionInput.from_paths(input_doc_paths)
--- a/poetry.lock
+++ b/poetry.lock
@ -759,6 +759,30 @@ torch = "2.2.2"
 torchvision = "0.17.2"
 tqdm = ">=4.64.0,<5.0.0"

+[[package]]
+name = "docling-parse"
+version = "0.0.1"
+description = "Simple package to extract text with coordinates from programmatic PDFs"
+optional = false
+python-versions = "<4.0,>=3.9"
+files = [
+    {file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:d6301dde11157f94b6436bb87186b4723cce7b1e59e0f74b0a7333339d6f911d"},
+    {file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:ac5fb3b6ac568159930103521f2e7002b78c37f6555f23d767b2e247ddbce740"},
+    {file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ec9066ad9e7f11a18aa230f67b733d64433185be1da8e887ac273c9683e02938"},
+    {file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:3e5d560ac3527a9bda5bf01905ec6a5fb9eb889a5bec2c3c909cf9c75642e2d3"},
+    {file = "docling_parse-0.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d56de1a5b45b19117d4fe1f444878501796ec5f17de880c06c1ce3184ac360e7"},
+    {file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:110a08f4663ee18833b2b89013993c2326b519a7fe21a64940d9f2789f52be29"},
+    {file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:19cf275ce78d2ebb7c3e577b5126f1f2af6fd28557b63c42d1455f1cc87be454"},
+    {file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1fdd07ac20951935e3f74b1ec1f503c4493440664aaa8e30ab7fa6334c2a4937"},
+    {file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:d8018263eba239c702f79149ed16ec4e749bdec5396aea9e78b9cdfbae1b86bd"},
+    {file = "docling_parse-0.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:299281bfc14ca95cc1db677f48f152105be0f96beab171313004cdb7ce448df4"},
+    {file = "docling_parse-0.0.1-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:b05d40d6570212ca1e3b98fb55ce1c861d28484db2bde513b6c5e8b3339f4021"},
+    {file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:11bdddc8f767bdd14b317bcb25d7fc46b656f867f137a5d8fe6d0f95d61d2ce9"},
+    {file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:14a52b46c887c00b0a1da0f5ea4e6652ab9e23deeac43f6d98b239a6cba7fbf1"},
+    {file = "docling_parse-0.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17caa551f7432555823f01a4882e869068198a8b27eec1449afc6c821b594330"},
+    {file = "docling_parse-0.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27aac51dd7753fac57466fa5de55e0ff0294367cf62a539941e72cfff8fb7e87"},
+]
+
 [[package]]
 name = "docutils"
 version = "0.21.2"
@ -2510,7 +2534,6 @@ description = "Nvidia JIT LTO Library"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
 ]
@ -4882,4 +4905,4 @@ ocr = ["easyocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184"
+content-hash = "9dfea6fabd2b8be0183a671c1540446cadc1da45a5460e636c71ae5b24abee0d"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -32,6 +32,7 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = { version = "^1.7", optional = true }
+docling-parse = "^0.0.1"

 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}