fix: Correct text extraction for table cells (#21)

* - Fixes the scaling transformation for table cell bounding boxes when using do_cell_matching = False
  - Corrected examples/convert.py with the appropriate parameter, for a good-quality example conversion

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

  Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
2024-07-30 14:51:47 +02:00
parent b07c4a7a4a
commit f4bf3d25b9
2 changed files with 12 additions and 3 deletions
@@ -114,12 +114,15 @@ class TableStructureModel:
                    for element in table_out["tf_responses"]:

                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(element["bbox"])
+                            the_bbox = BoundingBox.model_validate(
+                                element["bbox"]
+                            ).scaled(1 / self.scale)
                            text_piece = page._backend.get_text_in_rect(the_bbox)
                            element["bbox"]["token"] = text_piece

                        tc = TableCell.model_validate(element)
-                        tc.bbox = tc.bbox.scaled(1 / self.scale)
+                        if self.do_cell_matching:
+                            tc.bbox = tc.bbox.scaled(1 / self.scale)
                        table_cells.append(tc)

                    # Retrieving cols/rows, after post processing:
@@ -53,7 +53,13 @@ def main():

    artifacts_path = DocumentConverter.download_models_hf()

-    doc_converter = DocumentConverter(artifacts_path=artifacts_path)
+    pipeline_options = PipelineOptions(do_table_structure=True)
+    # use text cells predicted from table structure model, instead of matching with pdf cells
+    pipeline_options.table_structure_options.do_cell_matching = False
+
+    doc_converter = DocumentConverter(
+        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+    )

    input = DocumentConversionInput.from_paths(input_doc_paths)