fix: Word-level pdf cells for tables (#1238)
* word-level pdf cells for tables
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* removed comments
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Updated dependency to docling-core
  Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
---------
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
82694b2136
commit
8bd71e8e33
@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_core.types.doc.page import BoundingRectangle
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
SegmentedPdfPage,
|
||||
TextCellUnit,
|
||||
)
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
@ -218,9 +222,18 @@ class TableStructureModel(BasePageModel):
|
||||
|
||||
if len(table_bboxes):
|
||||
for table_cluster, tbl_box in in_tables:
|
||||
|
||||
# Check if word-level cells are available from backend:
|
||||
sp = page._backend.get_segmented_page()
|
||||
if sp is not None:
|
||||
tcells = sp.get_cells_in_bbox(
|
||||
cell_unit=TextCellUnit.WORD,
|
||||
bbox=table_cluster.bbox,
|
||||
)
|
||||
else:
|
||||
# Otherwise - we use normal (line/phrase) cells
|
||||
tcells = table_cluster.cells
|
||||
tokens = []
|
||||
for c in table_cluster.cells:
|
||||
for c in tcells:
|
||||
# Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
new_cell = copy.deepcopy(c)
|
||||
@ -229,7 +242,6 @@ class TableStructureModel(BasePageModel):
|
||||
scale=self.scale
|
||||
)
|
||||
)
|
||||
|
||||
tokens.append(
|
||||
{
|
||||
"id": new_cell.index,
|
||||
|
1693
poetry.lock
generated
1693
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
||||
docling-core = {extras = ["chunking"], version = "^2.24.1"}
|
||||
docling-ibm-models = "^3.4.0"
|
||||
docling-parse = "^4.0.0"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user