fix(mspowerpoint): handle invalid images in PowerPoint slides (#650)

- Add error handling for images that cannot be loaded by Pillow
- Improve resilience when encountering corrupted or unsupported image formats
- Maintain processing of other slide elements even if an image fails to load

Signed-off-by: Tendo33 <sjf1998112@gmail.com>
This commit is contained in:
Jinfeng Sun 2025-01-07 20:58:10 +08:00 committed by GitHub
parent 0ee849e8bc
commit d49650c54f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -16,7 +16,7 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from PIL import Image from PIL import Image, UnidentifiedImageError
from pptx import Presentation from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
bullet_type = "None" bullet_type = "None"
list_text = "" list_text = ""
list_label = GroupLabel.LIST list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip()) prov = self.generate_prov(shape, slide_ind, shape.text.strip())
# Identify if shape contains lists # Identify if shape contains lists
@@ -276,6 +277,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
im_dpi, _ = image.dpi im_dpi, _ = image.dpi
# Open it with PIL # Open it with PIL
try:
pil_image = Image.open(BytesIO(image_bytes)) pil_image = Image.open(BytesIO(image_bytes))
# shape has picture # shape has picture
@@ -286,6 +288,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
caption=None, caption=None,
prov=prov, prov=prov,
) )
except (UnidentifiedImageError, OSError) as e:
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
return return
def handle_tables(self, shape, parent_slide, slide_ind, doc): def handle_tables(self, shape, parent_slide, slide_ind, doc):