fix: Fixing images in the input Word files (#330)

* Fixing images identification in the input Word files

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Populating extracted image data into docling picture for wordx backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed base64 dependency in msword_backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2024-11-14 13:33:34 +01:00 · 2024-11-14 13:33:34 +01:00 · 8533039b0c
commit 8533039b0c
parent bf2a85f1d4
4 changed files with 107 additions and 78 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -9,10 +9,12 @@ from docling_core.types.doc import (
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    ImageRef,
    TableCell,
    TableData,
 )
 from lxml import etree
 from PIL import Image
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
        for element in body:
            tag_name = etree.QName(element).localname
-            # Check for Inline Images (drawings or blip elements)
-            found_drawing = etree.ElementBase.xpath(
-                element, ".//w:drawing", namespaces=self.xml_namespaces
-            )
-            found_pict = etree.ElementBase.xpath(
-                element, ".//w:pict", namespaces=self.xml_namespaces
-            )
+            # Check for Inline Images (blip elements)
+            drawing_blip = element.xpath(".//a:blip")
            # Check for Tables
            if element.tag.endswith("tbl"):
@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
-            elif found_drawing or found_pict:
+            elif drawing_blip:
-                self.handle_pictures(element, docx_obj, doc)
+                self.handle_pictures(element, docx_obj, drawing_blip, doc)
            # Check for Text
            elif tag_name in ["p"]:
                self.handle_text_elements(element, docx_obj, doc)
@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc.add_table(data=data, parent=self.parents[level - 1])
        return
-    def handle_pictures(self, element, docx_obj, doc):
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+    def handle_pictures(self, element, docx_obj, drawing_blip, doc):
+        def get_docx_image(element, drawing_blip):
+            rId = drawing_blip[0].get(
+                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
+            )
+            if rId in docx_obj.part.rels:
+                # Access the image part using the relationship ID
+                image_part = docx_obj.part.rels[rId].target_part
+                image_data = image_part.blob  # Get the binary image data
+            return image_data
+        image_data = get_docx_image(element, drawing_blip)
+        image_bytes = BytesIO(image_data)
+        # Open the BytesIO object with PIL to create an Image
+        pil_image = Image.open(image_bytes)
+        doc.add_picture(
+            parent=self.parents[self.level],
+            image=ImageRef.from_pil(image=pil_image, dpi=72),
+            caption=None,
+        )
        return
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt
@ -2,7 +2,7 @@ item-0 at level 0: unspecified: group _root_
  item-1 at level 1: paragraph: Summer activities
  item-2 at level 1: title: Swimming in the lake
    item-3 at level 2: paragraph: Duck
-    item-4 at level 2: paragraph: 
+    item-4 at level 2: picture
    item-5 at level 2: paragraph: Figure 1: This is a cute duckling
    item-6 at level 2: section_header: Let’s swim!
      item-7 at level 3: paragraph: To get started with swimming, fi ...  down in a water and try not to drown:
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json
--- a/tests/data/groundtruth/docling_v2/word_sample.docx.md
+++ b/tests/data/groundtruth/docling_v2/word_sample.docx.md
@ -4,6 +4,8 @@ Summer activities
 Duck
 <!-- image -->
 Figure 1: This is a cute duckling
 ## Let’s swim!