fix: Processing of placeholder shapes in pptx that have text but no bbox (#868)

Processing of placeholder shapes in pptx that have text but no bbox (bounding box).

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-02-03 09:33:33 +01:00 · 2025-02-03 09:33:33 +01:00 · eff16b62cc
commit eff16b62cc
parent b1cf796730
1 changed files with 35 additions and 22 deletions
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        return doc
-    def generate_prov(self, shape, slide_ind, text=""):
+    def generate_prov(
-        left = shape.left
+        self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
-        top = shape.top
+    ):
-        width = shape.width
+        if shape.left:
-        height = shape.height
+            left = shape.left
            top = shape.top
            width = shape.width
            height = shape.height
        else:
            left = 0
            top = 0
            width = slide_size.width
            height = slide_size.height
        shape_bbox = [left, top, left + width, top + height]
        shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
        # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
        prov = ProvenanceItem(
            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
        )
        return prov
-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
        is_a_list = False
        is_list_group_created = False
        enum_list_item_value = 0
@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        list_text = ""
        list_label = GroupLabel.LIST
        doc_label = DocItemLabel.LIST_ITEM
-        prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
        # Identify if shape contains lists
        for paragraph in shape.text_frame.paragraphs:
@ -270,7 +277,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                )
        return
-    def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+    def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
        # Open it with PIL
        try:
            # Get the image bytes
@ -280,7 +287,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
            pil_image = Image.open(BytesIO(image_bytes))
            # shape has picture
-            prov = self.generate_prov(shape, slide_ind, "")
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
            doc.add_picture(
                parent=parent_slide,
                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@ -291,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
        return
-    def handle_tables(self, shape, parent_slide, slide_ind, doc):
+    def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
        # Handling tables, images, charts
        if shape.has_table:
            table = shape.table
            table_xml = shape._element
-            prov = self.generate_prov(shape, slide_ind, "")
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
            num_cols = 0
            num_rows = len(table.rows)
@ -374,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
            )
-            size = Size(width=slide_width, height=slide_height)
+            slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
+            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
-            def handle_shapes(shape, parent_slide, slide_ind, doc):
+            def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
-                handle_groups(shape, parent_slide, slide_ind, doc)
+                handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
                if shape.has_table:
                    # Handle Tables
-                    self.handle_tables(shape, parent_slide, slide_ind, doc)
+                    self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    # Handle Pictures
-                    self.handle_pictures(shape, parent_slide, slide_ind, doc)
+                    self.handle_pictures(
                        shape, parent_slide, slide_ind, doc, slide_size
                    )
                # If shape doesn't have any text, move on to the next shape
                if not hasattr(shape, "text"):
                    return
@ -396,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                    _log.warning("Warning: shape has text but not text_frame")
                    return
                # Handle other text elements, including lists (bullet lists, numbered lists)
-                self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+                self.handle_text_elements(
                    shape, parent_slide, slide_ind, doc, slide_size
                )
                return
-            def handle_groups(shape, parent_slide, slide_ind, doc):
+            def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
                    for groupedshape in shape.shapes:
-                        handle_shapes(groupedshape, parent_slide, slide_ind, doc)
+                        handle_shapes(
                            groupedshape, parent_slide, slide_ind, doc, slide_size
                        )
            # Loop through each shape in the slide
            for shape in slide.shapes:
-                handle_shapes(shape, parent_slide, slide_ind, doc)
+                handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
        return doc