fix: Word-level pdf cells for tables (#1238)

* word-level pdf cells for tables

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed comments

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated dependency to docling-core

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2025-03-28 16:34:48 +01:00 committed by GitHub
parent 82694b2136
commit 8bd71e8e33
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 1310 additions and 405 deletions

View File

@@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
-from docling_core.types.doc.page import BoundingRectangle
+from docling_core.types.doc.page import (
+BoundingRectangle,
+SegmentedPdfPage,
+TextCellUnit,
+)
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from PIL import ImageDraw
@@ -218,9 +222,18 @@ class TableStructureModel(BasePageModel):
if len(table_bboxes):
for table_cluster, tbl_box in in_tables:
+# Check if word-level cells are available from backend:
+sp = page._backend.get_segmented_page()
+if sp is not None:
+tcells = sp.get_cells_in_bbox(
+cell_unit=TextCellUnit.WORD,
+bbox=table_cluster.bbox,
+)
+else:
+# Otherwise - we use normal (line/phrase) cells
+tcells = table_cluster.cells
tokens = []
-for c in table_cluster.cells:
+for c in tcells:
# Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
@@ -229,7 +242,6 @@ class TableStructureModel(BasePageModel):
scale=self.scale
)
)
tokens.append(
{
"id": new_cell.index,

1693
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
######################
python = "^3.9"
pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.23.1"}
+docling-core = {extras = ["chunking"], version = "^2.24.1"}
docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0"
filetype = "^1.2.0"