fix: Fixing images in the input Word files (#330)
* Fixing images identification in the input Word files

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Populating extracted image data into docling picture for wordx backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated tests

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed base64 dependency in msword_backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
bf2a85f1d4
commit
8533039b0c
@ -9,10 +9,12 @@ from docling_core.types.doc import (
|
|||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
|
ImageRef,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
# Check for Inline Images (drawings or blip elements)
|
# Check for Inline Images (blip elements)
|
||||||
found_drawing = etree.ElementBase.xpath(
|
drawing_blip = element.xpath(".//a:blip")
|
||||||
element, ".//w:drawing", namespaces=self.xml_namespaces
|
|
||||||
)
|
|
||||||
found_pict = etree.ElementBase.xpath(
|
|
||||||
element, ".//w:pict", namespaces=self.xml_namespaces
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check for Tables
|
# Check for Tables
|
||||||
if element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
except Exception:
|
except Exception:
|
||||||
_log.debug("could not parse a table, broken docx table")
|
_log.debug("could not parse a table, broken docx table")
|
||||||
|
|
||||||
elif found_drawing or found_pict:
|
elif drawing_blip:
|
||||||
self.handle_pictures(element, docx_obj, doc)
|
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
||||||
# Check for Text
|
# Check for Text
|
||||||
elif tag_name in ["p"]:
|
elif tag_name in ["p"]:
|
||||||
self.handle_text_elements(element, docx_obj, doc)
|
self.handle_text_elements(element, docx_obj, doc)
|
||||||
@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_pictures(self, element, docx_obj, doc):
|
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
|
||||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
def get_docx_image(element, drawing_blip):
|
||||||
|
rId = drawing_blip[0].get(
|
||||||
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
||||||
|
)
|
||||||
|
if rId in docx_obj.part.rels:
|
||||||
|
# Access the image part using the relationship ID
|
||||||
|
image_part = docx_obj.part.rels[rId].target_part
|
||||||
|
image_data = image_part.blob # Get the binary image data
|
||||||
|
return image_data
|
||||||
|
|
||||||
|
image_data = get_docx_image(element, drawing_blip)
|
||||||
|
image_bytes = BytesIO(image_data)
|
||||||
|
# Open the BytesIO object with PIL to create an Image
|
||||||
|
pil_image = Image.open(image_bytes)
|
||||||
|
doc.add_picture(
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||||
|
caption=None,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
@ -2,7 +2,7 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-1 at level 1: paragraph: Summer activities
|
item-1 at level 1: paragraph: Summer activities
|
||||||
item-2 at level 1: title: Swimming in the lake
|
item-2 at level 1: title: Swimming in the lake
|
||||||
item-3 at level 2: paragraph: Duck
|
item-3 at level 2: paragraph: Duck
|
||||||
item-4 at level 2: paragraph:
|
item-4 at level 2: picture
|
||||||
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
|
item-5 at level 2: paragraph: Figure 1: This is a cute duckling
|
||||||
item-6 at level 2: section_header: Let’s swim!
|
item-6 at level 2: section_header: Let’s swim!
|
||||||
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
|
item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown:
|
||||||
|
File diff suppressed because one or more lines are too long
@ -4,6 +4,8 @@ Summer activities
|
|||||||
|
|
||||||
Duck
|
Duck
|
||||||
|
|
||||||
|
<!-- image -->
|
||||||
|
|
||||||
Figure 1: This is a cute duckling
|
Figure 1: This is a cute duckling
|
||||||
|
|
||||||
## Let’s swim!
|
## Let’s swim!
|
||||||
|
Loading…
Reference in New Issue
Block a user