fix: Processing of placeholder shapes in pptx that have text but no bbox (#868)

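Placeholder shapes in PPTX files can have text without a bounding box of their own. For such shapes, provenance generation now falls back to a bounding box covering the whole slide (origin at 0,0 with the slide's width and height).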

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2025-02-03 09:33:33 +01:00 committed by GitHub
parent b1cf796730
commit eff16b62cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return doc return doc
def generate_prov(self, shape, slide_ind, text=""): def generate_prov(
left = shape.left self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
top = shape.top ):
width = shape.width if shape.left:
height = shape.height left = shape.left
top = shape.top
width = shape.width
height = shape.height
else:
left = 0
top = 0
width = slide_size.width
height = slide_size.height
shape_bbox = [left, top, left + width, top + height] shape_bbox = [left, top, left + width, top + height]
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT) shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
prov = ProvenanceItem( prov = ProvenanceItem(
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
) )
return prov return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc): def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
is_a_list = False is_a_list = False
is_list_group_created = False is_list_group_created = False
enum_list_item_value = 0 enum_list_item_value = 0
@@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
list_text = "" list_text = ""
list_label = GroupLabel.LIST list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip()) prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
# Identify if shape contains lists # Identify if shape contains lists
for paragraph in shape.text_frame.paragraphs: for paragraph in shape.text_frame.paragraphs:
@@ -270,7 +277,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
) )
return return
def handle_pictures(self, shape, parent_slide, slide_ind, doc): def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
# Open it with PIL # Open it with PIL
try: try:
# Get the image bytes # Get the image bytes
@@ -280,7 +287,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
pil_image = Image.open(BytesIO(image_bytes)) pil_image = Image.open(BytesIO(image_bytes))
# shape has picture # shape has picture
prov = self.generate_prov(shape, slide_ind, "") prov = self.generate_prov(shape, slide_ind, "", slide_size)
doc.add_picture( doc.add_picture(
parent=parent_slide, parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
@@ -291,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}") _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
return return
def handle_tables(self, shape, parent_slide, slide_ind, doc): def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
# Handling tables, images, charts # Handling tables, images, charts
if shape.has_table: if shape.has_table:
table = shape.table table = shape.table
table_xml = shape._element table_xml = shape._element
prov = self.generate_prov(shape, slide_ind, "") prov = self.generate_prov(shape, slide_ind, "", slide_size)
num_cols = 0 num_cols = 0
num_rows = len(table.rows) num_rows = len(table.rows)
@@ -374,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0] name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
) )
size = Size(width=slide_width, height=slide_height) slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size) parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc): def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc) handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
if shape.has_table: if shape.has_table:
# Handle Tables # Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc) self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Pictures # Handle Pictures
self.handle_pictures(shape, parent_slide, slide_ind, doc) self.handle_pictures(
shape, parent_slide, slide_ind, doc, slide_size
)
# If shape doesn't have any text, move on to the next shape # If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"): if not hasattr(shape, "text"):
return return
@@ -396,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
_log.warning("Warning: shape has text but not text_frame") _log.warning("Warning: shape has text but not text_frame")
return return
# Handle other text elements, including lists (bullet lists, numbered lists) # Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc) self.handle_text_elements(
shape, parent_slide, slide_ind, doc, slide_size
)
return return
def handle_groups(shape, parent_slide, slide_ind, doc): def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
if shape.shape_type == MSO_SHAPE_TYPE.GROUP: if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
for groupedshape in shape.shapes: for groupedshape in shape.shapes:
handle_shapes(groupedshape, parent_slide, slide_ind, doc) handle_shapes(
groupedshape, parent_slide, slide_ind, doc, slide_size
)
# Loop through each shape in the slide # Loop through each shape in the slide
for shape in slide.shapes: for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc) handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
return doc return doc