fix: Correct text extraction for table cells (#21)
* - Fix the scaling transformation of table cell bounding boxes when do_cell_matching = False
  - Correct examples/convert.py to set the appropriate parameter, for a good-quality example conversion

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
b07c4a7a4a
commit
f4bf3d25b9
@ -114,12 +114,15 @@ class TableStructureModel:
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
).scaled(1 / self.scale)
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
if self.do_cell_matching:
|
||||
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
||||
table_cells.append(tc)
|
||||
|
||||
# Retrieving cols/rows, after post processing:
|
||||
|
@ -53,7 +53,13 @@ def main():
|
||||
|
||||
artifacts_path = DocumentConverter.download_models_hf()
|
||||
|
||||
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
# use text cells predicted from table structure model, instead of matching with pdf cells
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
)
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user