feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Make page.parsed_page the only source of truth for text cells
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Small fix
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Correctly compute PDF boxes from pymupdf
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use different OCR engine order
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add type hints and fix mypy
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* One more test fix
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Remove with pypdfium2_lock from caller sites
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix typing
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-06-13 19:01:55 +02:00
parent 0432a31b2f
commit 7d3302cb48
50 changed files with 339091 additions and 330047 deletions
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
-        page_size = self.get_size()
-
-        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
-
-        # for cell in self._dpage.textline_cells:
-        #     rect = cell.rect
-        #
-        #     assert (
-        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
-        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
-        #     assert (
-        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
-        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
-
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        with pypdfium2_lock:
+            seg_page = self.dp_doc.get_page(
+                page_no + 1,
+                create_words=create_words,
+                create_textlines=create_textlines,
+            )
+
+            # In Docling, all TextCell instances are expected with top-left origin.
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.textline_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.char_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.word_cells
+            ]
+
            return DoclingParseV4PageBackend(
-                self.dp_doc.get_page(
-                    page_no + 1,
-                    create_words=create_words,
-                    create_textlines=create_textlines,
-                ),
+                seg_page,
                self._pdoc[page_no],
            )