fix: Processing of placeholder shapes in pptx that have text but no bbox (#868)
Processing of placeholder shapes in pptx that have text but no bbox.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
b1cf796730
commit
eff16b62cc
@ -98,21 +98,28 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def generate_prov(self, shape, slide_ind, text=""):
|
def generate_prov(
|
||||||
left = shape.left
|
self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
|
||||||
top = shape.top
|
):
|
||||||
width = shape.width
|
if shape.left:
|
||||||
height = shape.height
|
left = shape.left
|
||||||
|
top = shape.top
|
||||||
|
width = shape.width
|
||||||
|
height = shape.height
|
||||||
|
else:
|
||||||
|
left = 0
|
||||||
|
top = 0
|
||||||
|
width = slide_size.width
|
||||||
|
height = slide_size.height
|
||||||
shape_bbox = [left, top, left + width, top + height]
|
shape_bbox = [left, top, left + width, top + height]
|
||||||
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
|
shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
|
||||||
# prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}]
|
|
||||||
prov = ProvenanceItem(
|
prov = ProvenanceItem(
|
||||||
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
|
page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
|
||||||
)
|
)
|
||||||
|
|
||||||
return prov
|
return prov
|
||||||
|
|
||||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||||
is_a_list = False
|
is_a_list = False
|
||||||
is_list_group_created = False
|
is_list_group_created = False
|
||||||
enum_list_item_value = 0
|
enum_list_item_value = 0
|
||||||
@ -121,7 +128,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
list_text = ""
|
list_text = ""
|
||||||
list_label = GroupLabel.LIST
|
list_label = GroupLabel.LIST
|
||||||
doc_label = DocItemLabel.LIST_ITEM
|
doc_label = DocItemLabel.LIST_ITEM
|
||||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
||||||
|
|
||||||
# Identify if shape contains lists
|
# Identify if shape contains lists
|
||||||
for paragraph in shape.text_frame.paragraphs:
|
for paragraph in shape.text_frame.paragraphs:
|
||||||
@ -270,7 +277,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
|
def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||||
# Open it with PIL
|
# Open it with PIL
|
||||||
try:
|
try:
|
||||||
# Get the image bytes
|
# Get the image bytes
|
||||||
@ -280,7 +287,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
pil_image = Image.open(BytesIO(image_bytes))
|
pil_image = Image.open(BytesIO(image_bytes))
|
||||||
|
|
||||||
# shape has picture
|
# shape has picture
|
||||||
prov = self.generate_prov(shape, slide_ind, "")
|
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
parent=parent_slide,
|
parent=parent_slide,
|
||||||
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
||||||
@ -291,13 +298,13 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||||
# Handling tables, images, charts
|
# Handling tables, images, charts
|
||||||
if shape.has_table:
|
if shape.has_table:
|
||||||
table = shape.table
|
table = shape.table
|
||||||
table_xml = shape._element
|
table_xml = shape._element
|
||||||
|
|
||||||
prov = self.generate_prov(shape, slide_ind, "")
|
prov = self.generate_prov(shape, slide_ind, "", slide_size)
|
||||||
|
|
||||||
num_cols = 0
|
num_cols = 0
|
||||||
num_rows = len(table.rows)
|
num_rows = len(table.rows)
|
||||||
@ -374,17 +381,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
|
||||||
)
|
)
|
||||||
|
|
||||||
size = Size(width=slide_width, height=slide_height)
|
slide_size = Size(width=slide_width, height=slide_height)
|
||||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
|
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
||||||
|
|
||||||
def handle_shapes(shape, parent_slide, slide_ind, doc):
|
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
||||||
handle_groups(shape, parent_slide, slide_ind, doc)
|
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
||||||
if shape.has_table:
|
if shape.has_table:
|
||||||
# Handle Tables
|
# Handle Tables
|
||||||
self.handle_tables(shape, parent_slide, slide_ind, doc)
|
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||||
# Handle Pictures
|
# Handle Pictures
|
||||||
self.handle_pictures(shape, parent_slide, slide_ind, doc)
|
self.handle_pictures(
|
||||||
|
shape, parent_slide, slide_ind, doc, slide_size
|
||||||
|
)
|
||||||
# If shape doesn't have any text, move on to the next shape
|
# If shape doesn't have any text, move on to the next shape
|
||||||
if not hasattr(shape, "text"):
|
if not hasattr(shape, "text"):
|
||||||
return
|
return
|
||||||
@ -396,16 +405,20 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
_log.warning("Warning: shape has text but not text_frame")
|
_log.warning("Warning: shape has text but not text_frame")
|
||||||
return
|
return
|
||||||
# Handle other text elements, including lists (bullet lists, numbered lists)
|
# Handle other text elements, including lists (bullet lists, numbered lists)
|
||||||
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
|
self.handle_text_elements(
|
||||||
|
shape, parent_slide, slide_ind, doc, slide_size
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_groups(shape, parent_slide, slide_ind, doc):
|
def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
|
||||||
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
|
||||||
for groupedshape in shape.shapes:
|
for groupedshape in shape.shapes:
|
||||||
handle_shapes(groupedshape, parent_slide, slide_ind, doc)
|
handle_shapes(
|
||||||
|
groupedshape, parent_slide, slide_ind, doc, slide_size
|
||||||
|
)
|
||||||
|
|
||||||
# Loop through each shape in the slide
|
# Loop through each shape in the slide
|
||||||
for shape in slide.shapes:
|
for shape in slide.shapes:
|
||||||
handle_shapes(shape, parent_slide, slide_ind, doc)
|
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
Loading…
Reference in New Issue
Block a user