fix: Correct text extraction for table cells (#21)

* - Fix the scaling transformation of table cell bounding boxes when do_cell_matching = False
- Correct examples/convert.py to set the appropriate parameter, so the example conversion produces good-quality output

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-07-30 14:51:47 +02:00 committed by GitHub
parent b07c4a7a4a
commit f4bf3d25b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 3 deletions

View File

@ -114,12 +114,15 @@ class TableStructureModel:
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(element["bbox"])
the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
tc.bbox = tc.bbox.scaled(1 / self.scale)
if self.do_cell_matching:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
# Retrieving cols/rows, after post processing:

View File

@ -53,7 +53,13 @@ def main():
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
pipeline_options = PipelineOptions(do_table_structure=True)
# use text cells predicted from table structure model, instead of matching with pdf cells
pipeline_options.table_structure_options.do_cell_matching = False
doc_converter = DocumentConverter(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
)
input = DocumentConversionInput.from_paths(input_doc_paths)