fix: Word-level pdf cells for tables (#1238)

* word-level pdf cells for tables
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* removed comments
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Updated dependency to docling-core
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---------
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-03-28 16:34:48 +01:00
parent 82694b2136
commit 8bd71e8e33
3 changed files with 1310 additions and 405 deletions
@@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
-from docling_core.types.doc.page import BoundingRectangle
+from docling_core.types.doc.page import (
    BoundingRectangle,
    SegmentedPdfPage,
    TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw
@@ -218,9 +222,18 @@ class TableStructureModel(BasePageModel):
                    if len(table_bboxes):
                        for table_cluster, tbl_box in in_tables:
-
+                            # Check if word-level cells are available from backend:
                            sp = page._backend.get_segmented_page()
                            if sp is not None:
                                tcells = sp.get_cells_in_bbox(
                                    cell_unit=TextCellUnit.WORD,
                                    bbox=table_cluster.bbox,
                                )
                            else:
                                # Otherwise - we use normal (line/phrase) cells
                                tcells = table_cluster.cells
                            tokens = []
-                            for c in table_cluster.cells:
+                            for c in tcells:
                                 # Only allow non-empty strings (not whitespace-only) into the cells of a table
                                if len(c.text.strip()) > 0:
                                    new_cell = copy.deepcopy(c)
@@ -229,7 +242,6 @@ class TableStructureModel(BasePageModel):
                                            scale=self.scale
                                        )
                                    )
                                    tokens.append(
                                        {
                                            "id": new_cell.index,
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.23.1"}
+docling-core = {extras = ["chunking"], version = "^2.24.1"}
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"