From fea0a99a95d97e72687f48f8174d31102655483e Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Thu, 30 Jan 2025 14:58:27 +0100 Subject: [PATCH] fix: Fix for the crash when encountering WMF images in pptx and docx (#837) * Fix for the crash when encountering WMF images in pptx and docx backends on non Windows platforms Signed-off-by: Maksym Lysak * Updated faq Signed-off-by: Maksym Lysak --------- Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/mspowerpoint_backend.py | 9 ++++----- docling/backend/msword_backend.py | 4 ++-- docs/faq.md | 8 ++++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 995969d..aecebdc 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -271,13 +271,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB return def handle_pictures(self, shape, parent_slide, slide_ind, doc): - # Get the image bytes - image = shape.image - image_bytes = image.blob - im_dpi, _ = image.dpi - # Open it with PIL try: + # Get the image bytes + image = shape.image + image_bytes = image.blob + im_dpi, _ = image.dpi pil_image = Image.open(BytesIO(image_bytes)) # shape has picture diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f8148d5..0af3db5 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -520,11 +520,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): image_data = image_part.blob # Get the binary image data return image_data - image_data = get_docx_image(element, drawing_blip) - image_bytes = BytesIO(image_data) level = self.get_level() # Open the BytesIO object with PIL to create an Image try: + image_data = get_docx_image(element, drawing_blip) + image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) doc.add_picture( parent=self.parents[level - 1], diff --git a/docs/faq.md b/docs/faq.md index 96e12ed..d8b85cd 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -151,3 +151,11 @@ This is a collection of FAQ collected from the user questions on