feat(docx): add text formatting and hyperlink support (#630)

* feat: Enable markdown text formatting for docx

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Fix imports

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use Formatting

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle hyperlink

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle formatting properly for DocItemLabel.PARAGRAPH

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use inline group

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle bullet lists

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Strip elements

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Strip elements

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Run black and mypy

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle header and footer

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use inline_fmt everywhere

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Run precommit

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Address feedback

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Fix add_list_item

Signed-off-by: SimJeg <sjegou@nvidia.com>

* fix minor bugs, mark helper methods internal

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: SimJeg <sjegou@nvidia.com>
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Simon Jégou 2025-04-03 15:11:50 +02:00 committed by GitHub
parent 71148eb381
commit bfcab3d677
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 852 additions and 86 deletions

View File

@ -14,15 +14,19 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from docling_core.types.doc.document import Formatting
from docx import Document from docx import Document
from docx.document import Document as DocxDocument from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc from docx.oxml.table import CT_Tc
from docx.oxml.xmlchemy import BaseOxmlElement from docx.oxml.xmlchemy import BaseOxmlElement
from docx.table import Table, _Cell from docx.table import Table, _Cell
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree from lxml import etree
from lxml.etree import XPath from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl
from typing_extensions import override from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -118,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc return doc
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init." f"Cannot convert doc with {self.document_hash} because the backend failed to init."
) )
def update_history( def _update_history(
self, self,
name: str, name: str,
level: Optional[int], level: Optional[int],
@ -138,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.history["numids"].append(numid) self.history["numids"].append(numid)
self.history["indents"].append(ilevel) self.history["indents"].append(ilevel)
def prev_name(self) -> Optional[str]: def _prev_name(self) -> Optional[str]:
return self.history["names"][-1] return self.history["names"][-1]
def prev_level(self) -> Optional[int]: def _prev_level(self) -> Optional[int]:
return self.history["levels"][-1] return self.history["levels"][-1]
def prev_numid(self) -> Optional[int]: def _prev_numid(self) -> Optional[int]:
return self.history["numids"][-1] return self.history["numids"][-1]
def prev_indent(self) -> Optional[int]: def _prev_indent(self) -> Optional[int]:
return self.history["indents"][-1] return self.history["indents"][-1]
def get_level(self) -> int: def _get_level(self) -> int:
"""Return the first None index.""" """Return the first None index."""
for k, v in self.parents.items(): for k, v in self.parents.items():
if k >= 0 and v == None: if k >= 0 and v == None:
return k return k
return 0 return 0
def walk_linear( def _walk_linear(
self, self,
body: BaseOxmlElement, body: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
@ -177,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check for Tables # Check for Tables
if element.tag.endswith("tbl"): if element.tag.endswith("tbl"):
try: try:
self.handle_tables(element, docx_obj, doc) self._handle_tables(element, docx_obj, doc)
except Exception: except Exception:
_log.debug("could not parse a table, broken docx table") _log.debug("could not parse a table, broken docx table")
elif drawing_blip: elif drawing_blip:
self.handle_pictures(docx_obj, drawing_blip, doc) self._handle_pictures(docx_obj, drawing_blip, doc)
# Check for the sdt containers, like table of contents # Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]: elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -190,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Iterate paragraphs, runs, or text inside <w:sdtContent>. # Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs: for p in paragraphs:
self.handle_text_elements(p, docx_obj, doc) self._handle_text_elements(p, docx_obj, doc)
# Check for Text # Check for Text
elif tag_name in ["p"]: elif tag_name in ["p"]:
# "tcPr", "sectPr" # "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc) self._handle_text_elements(element, docx_obj, doc)
else: else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}") _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc return doc
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: def _str_to_int(
self, s: Optional[str], default: Optional[int] = 0
) -> Optional[int]:
if s is None: if s is None:
return None return None
try: try:
@ -207,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError: except ValueError:
return default return default
def split_text_and_number(self, input_string: str) -> list[str]: def _split_text_and_number(self, input_string: str) -> list[str]:
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match: if match:
parts = list(filter(None, match.groups())) parts = list(filter(None, match.groups()))
@ -215,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else: else:
return [input_string] return [input_string]
def get_numId_and_ilvl( def _get_numId_and_ilvl(
self, paragraph: Paragraph self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]: ) -> tuple[Optional[int], Optional[int]]:
# Access the XML element of the paragraph # Access the XML element of the paragraph
@ -230,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, None), self.str_to_int(ilvl, None) return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
return None, None # If the paragraph is not part of a list return None, None # If the paragraph is not part of a list
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self.split_text_and_number(style_label) parts = self._split_text_and_number(style_label)
if len(parts) == 2: if len(parts) == 2:
parts.sort() parts.sort()
@ -243,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label_level: Optional[int] = 0 label_level: Optional[int] = 0
if parts[0].strip().lower() == "heading": if parts[0].strip().lower() == "heading":
label_str = "Heading" label_str = "Heading"
label_level = self.str_to_int(parts[1], None) label_level = self._str_to_int(parts[1], None)
if parts[1].strip().lower() == "heading": if parts[1].strip().lower() == "heading":
label_str = "Heading" label_str = "Heading"
label_level = self.str_to_int(parts[0], None) label_level = self._str_to_int(parts[0], None)
return label_str, label_level return label_str, label_level
return style_label, None return style_label, None
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
if paragraph.style is None: if paragraph.style is None:
return "Normal", None return "Normal", None
@ -264,16 +270,82 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if ":" in label: if ":" in label:
parts = label.split(":") parts = label.split(":")
if len(parts) == 2: if len(parts) == 2:
return parts[0], self.str_to_int(parts[1], None) return parts[0], self._str_to_int(parts[1], None)
if "heading" in label.lower(): if "heading" in label.lower():
return self.get_heading_and_level(label) return self._get_heading_and_level(label)
if "heading" in name.lower(): if "heading" in name.lower():
return self.get_heading_and_level(name) return self._get_heading_and_level(name)
return label, None return label, None
def handle_equations_in_text(self, element, text): @classmethod
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
    """Translate the character styling of a docx run into a ``Formatting``.

    Args:
        run: The python-docx run whose bold/italic/underline flags are read.

    Returns:
        A ``Formatting`` carrying the three flags when at least one of them
        is set, otherwise ``None`` (plain, unformatted text).
    """
    # The run attributes are tri-state (True / False / None); a falsy value
    # is treated as "not set", exactly like the previous `x or False` did.
    is_bold = bool(run.bold)
    is_italic = bool(run.italic)
    # NOTE(review): python-docx documents `underline` as possibly being a
    # WD_UNDERLINE member (e.g. double/thick) instead of a bool. Coerce it
    # so only a plain boolean is ever handed to Formatting.
    is_underline = bool(run.underline)

    if not (is_bold or is_italic or is_underline):
        return None
    return Formatting(
        bold=is_bold,
        italic=is_italic,
        underline=is_underline,
    )
def _get_paragraph_elements(self, paragraph: Paragraph):
    """Split a paragraph into ``(text, formatting, hyperlink)`` chunks.

    Consecutive runs that share the same formatting are merged into a single
    chunk, while every hyperlink becomes a chunk of its own. The text of each
    chunk is stripped of surrounding whitespace.

    Args:
        paragraph: The python-docx paragraph to decompose.

    Returns:
        A list of ``(text, formatting, hyperlink)`` tuples. ``formatting`` is
        ``None`` for unformatted text and ``hyperlink`` is ``None`` for text
        that is not a link. A blank paragraph yields ``[("", None, None)]``.
    """
    # for now retain empty paragraphs for backwards compatibility:
    if paragraph.text.strip() == "":
        return [("", None, None)]

    paragraph_elements: list[
        tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
    ] = []
    group_text = ""
    previous_format: Optional[Formatting] = None

    # Iterate over the runs of the paragraph and group them by format
    for c in paragraph.iter_inner_content():
        hyperlink: Optional[Union[AnyUrl, Path]] = None
        if isinstance(c, Hyperlink):
            text = c.text
            address = c.address
            if "://" in address:
                # Path() collapses the double slash ("https://x" becomes
                # "https:/x"), so absolute URLs must be kept as AnyUrl.
                try:
                    hyperlink = AnyUrl(address)
                except ValueError:
                    # pydantic's validation error derives from ValueError;
                    # fall back to the previous Path-based behaviour.
                    hyperlink = Path(address)
            else:
                hyperlink = Path(address)
            # A hyperlink may carry no runs at all; do not index blindly.
            fmt = self._get_format_from_run(c.runs[0]) if c.runs else None
        elif isinstance(c, Run):
            text = c.text
            fmt = self._get_format_from_run(c)
        else:
            continue

        if (len(text.strip()) and fmt != previous_format) or (
            hyperlink is not None
        ):
            # If the style changes for a non empty text, add the previous group
            if len(group_text.strip()) > 0:
                paragraph_elements.append(
                    (group_text.strip(), previous_format, None)
                )
            group_text = ""

            # If there is a hyperlink, add it immediately
            if hyperlink is not None:
                paragraph_elements.append((text.strip(), fmt, hyperlink))
                text = ""
            else:
                previous_format = fmt

        group_text += text

    # Flush the last group. It must carry the format the group was opened
    # with (previous_format), not the format of whichever run came last: a
    # trailing whitespace-only run with different styling would otherwise
    # mislabel the whole group.
    if len(group_text.strip()) > 0:
        paragraph_elements.append((group_text.strip(), previous_format, None))

    return paragraph_elements
def _handle_equations_in_text(self, element, text):
only_texts = [] only_texts = []
only_equations = [] only_equations = []
texts_and_equations = [] texts_and_equations = []
@ -319,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return output_text, only_equations return output_text, only_equations
def handle_text_elements( def _create_or_reuse_parent(
self,
*,
doc: DoclingDocument,
prev_parent: Optional[NodeItem],
paragraph_elements: list,
) -> Optional[NodeItem]:
return (
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
if len(paragraph_elements) > 1
else prev_parent
)
def _handle_text_elements(
self, self,
element: BaseOxmlElement, element: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
@ -328,10 +413,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = Paragraph(element, docx_obj) paragraph = Paragraph(element, docx_obj)
raw_text = paragraph.text raw_text = paragraph.text
text, equations = self.handle_equations_in_text(element=element, text=raw_text) text, equations = self._handle_equations_in_text(element=element, text=raw_text)
if text is None: if text is None:
return return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip() text = text.strip()
# Common styles for bullet and numbered lists. # Common styles for bullet and numbered lists.
@ -339,8 +425,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Identify whether list is a numbered list or not # Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name # is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False is_numbered = False
p_style_id, p_level = self.get_label_and_level(paragraph) p_style_id, p_level = self._get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph) numid, ilevel = self._get_numId_and_ilvl(paragraph)
if numid == 0: if numid == 0:
numid = None numid = None
@ -351,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and ilevel is not None and ilevel is not None
and p_style_id not in ["Title", "Heading"] and p_style_id not in ["Title", "Heading"]
): ):
self.add_listitem( self._add_list_item(
doc, doc=doc,
numid, numid=numid,
ilevel, ilevel=ilevel,
text, elements=paragraph_elements,
is_numbered, is_numbered=is_numbered,
) )
self.update_history(p_style_id, p_level, numid, ilevel) self._update_history(p_style_id, p_level, numid, ilevel)
return return
elif ( elif (
numid is None numid is None
and self.prev_numid() is not None and self._prev_numid() is not None
and p_style_id not in ["Title", "Heading"] and p_style_id not in ["Title", "Heading"]
): # Close list ): # Close list
if self.level_at_new_list: if self.level_at_new_list:
@ -390,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
else: else:
is_numbered_style = False is_numbered_style = False
self.add_header(doc, p_level, text, is_numbered_style) self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0: elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0: if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation # Standalone equation
level = self.get_level() level = self._get_level()
doc.add_text( doc.add_text(
label=DocItemLabel.FORMULA, label=DocItemLabel.FORMULA,
parent=self.parents[level - 1], parent=self.parents[level - 1],
@ -403,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
else: else:
# Inline equation # Inline equation
level = self.get_level() level = self._get_level()
inline_equation = doc.add_group( inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1] label=GroupLabel.INLINE, parent=self.parents[level - 1]
) )
@ -442,30 +528,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"ListBullet", "ListBullet",
"Quote", "Quote",
]: ]:
level = self.get_level() level = self._get_level()
doc.add_text( parent = self._create_or_reuse_parent(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
) )
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
else: else:
# Text style names can, and will have, not only default values but user values too # Text style names can, and will have, not only default values but user values too
# hence we treat all other labels as pure text # hence we treat all other labels as pure text
level = self.get_level() level = self._get_level()
doc.add_text( parent = self._create_or_reuse_parent(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
) )
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.update_history(p_style_id, p_level, numid, ilevel) self._update_history(p_style_id, p_level, numid, ilevel)
return return
def add_header( def _add_header(
self, self,
doc: DoclingDocument, doc: DoclingDocument,
curr_level: Optional[int], curr_level: Optional[int],
text: str, text: str,
is_numbered_style: bool = False, is_numbered_style: bool = False,
) -> None: ) -> None:
level = self.get_level() level = self._get_level()
if isinstance(curr_level, int): if isinstance(curr_level, int):
if curr_level > level: if curr_level > level:
# add invisible group # add invisible group
@ -521,19 +627,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
return return
def add_listitem( def _add_list_item(
self, self,
*,
doc: DoclingDocument, doc: DoclingDocument,
numid: int, numid: int,
ilevel: int, ilevel: int,
text: str, elements: list,
is_numbered: bool = False, is_numbered: bool = False,
) -> None: ) -> None:
enum_marker = "" enum_marker = ""
level = self.get_level() level = self._get_level()
prev_indent = self.prev_indent() prev_indent = self._prev_indent()
if self.prev_numid() is None: # Open new list if self._prev_numid() is None: # Open new list
self.level_at_new_list = level self.level_at_new_list = level
self.parents[level] = doc.add_group( self.parents[level] = doc.add_group(
@ -545,15 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered: if is_numbered:
enum_marker = str(self.listIter) + "." enum_marker = str(self.listIter) + "."
is_numbered = True is_numbered = True
doc.add_list_item( new_parent = self._create_or_reuse_parent(
marker=enum_marker, doc=doc,
enumerated=is_numbered, prev_parent=self.parents[level],
parent=self.parents[level], paragraph_elements=elements,
text=text,
) )
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif ( elif (
self.prev_numid() == numid self._prev_numid() == numid
and self.level_at_new_list is not None and self.level_at_new_list is not None
and prev_indent is not None and prev_indent is not None
and prev_indent < ilevel and prev_indent < ilevel
@ -581,15 +696,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered: if is_numbered:
enum_marker = str(self.listIter) + "." enum_marker = str(self.listIter) + "."
is_numbered = True is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif ( elif (
self.prev_numid() == numid self._prev_numid() == numid
and self.level_at_new_list is not None and self.level_at_new_list is not None
and prev_indent is not None and prev_indent is not None
and ilevel < prev_indent and ilevel < prev_indent
@ -603,29 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered: if is_numbered:
enum_marker = str(self.listIter) + "." enum_marker = str(self.listIter) + "."
is_numbered = True is_numbered = True
doc.add_list_item( new_parent = self._create_or_reuse_parent(
marker=enum_marker, doc=doc,
enumerated=is_numbered, prev_parent=self.parents[self.level_at_new_list + ilevel],
parent=self.parents[self.level_at_new_list + ilevel], paragraph_elements=elements,
text=text,
) )
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0 self.listIter = 0
elif self.prev_numid() == numid or prev_indent == ilevel: elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1 self.listIter += 1
if is_numbered: if is_numbered:
enum_marker = str(self.listIter) + "." enum_marker = str(self.listIter) + "."
is_numbered = True is_numbered = True
doc.add_list_item( new_parent = self._create_or_reuse_parent(
marker=enum_marker, doc=doc,
enumerated=is_numbered, prev_parent=self.parents[level - 1],
parent=self.parents[level - 1], paragraph_elements=elements,
text=text,
) )
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return return
def handle_tables( def _handle_tables(
self, self,
element: BaseOxmlElement, element: BaseOxmlElement,
docx_obj: DocxDocument, docx_obj: DocxDocument,
@ -640,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
cell_element = table.rows[0].cells[0] cell_element = table.rows[0].cells[0]
# In case we have a table of only 1 cell, we consider it furniture # In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body # And proceed processing the content of the cell as though it's in the document body
self.walk_linear(cell_element._element, docx_obj, doc) self._walk_linear(cell_element._element, docx_obj, doc)
return return
data = TableData(num_rows=num_rows, num_cols=num_cols) data = TableData(num_rows=num_rows, num_cols=num_cols)
@ -685,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
data.table_cells.append(table_cell) data.table_cells.append(table_cell)
col_idx += cell.grid_span col_idx += cell.grid_span
level = self.get_level() level = self._get_level()
doc.add_table(data=data, parent=self.parents[level - 1]) doc.add_table(data=data, parent=self.parents[level - 1])
return return
def handle_pictures( def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None: ) -> None:
def get_docx_image(drawing_blip): def get_docx_image(drawing_blip):
@ -702,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data image_data = image_part.blob # Get the binary image data
return image_data return image_data
level = self.get_level() level = self._get_level()
# Open the BytesIO object with PIL to create an Image # Open the BytesIO object with PIL to create an Image
try: try:
image_data = get_docx_image(drawing_blip) image_data = get_docx_image(drawing_blip)

Binary file not shown.

View File

@ -0,0 +1,30 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: italic
item-2 at level 1: paragraph: bold
item-3 at level 1: paragraph: underline
item-4 at level 1: paragraph: hyperlink
item-5 at level 1: paragraph: italic and bold hyperlink
item-6 at level 1: inline: group group
item-7 at level 2: paragraph: Normal
item-8 at level 2: paragraph: italic
item-9 at level 2: paragraph: bold
item-10 at level 2: paragraph: underline
item-11 at level 2: paragraph: and
item-12 at level 2: paragraph: hyperlink
item-13 at level 2: paragraph: on the same line
item-14 at level 1: paragraph:
item-15 at level 1: list: group list
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list_item: Some
item-21 at level 3: list_item: italic
item-22 at level 3: list_item: bold
item-23 at level 3: list_item: underline
item-24 at level 2: list: group list
item-25 at level 3: inline: group group
item-26 at level 4: list_item: Nested
item-27 at level 4: list_item: italic
item-28 at level 4: list_item: bold
item-29 at level 1: paragraph:

View File

@ -0,0 +1,577 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 16380079676357958448,
"filename": "unit_test_formatting.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/12"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/23"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https://github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic and bold hyperlink",
"text": "italic and bold hyperlink",
"formatting": {
"bold": true,
"italic": true,
"underline": false,
"strikethrough": false
},
"hyperlink": "https://github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Normal",
"text": "Normal"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "and",
"text": "and"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https://github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "on the same line",
"text": "on the same line"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Italic bullet 1",
"text": "Italic bullet 1",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Bold bullet 2",
"text": "Bold bullet 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Underline bullet 3",
"text": "Underline bullet 3",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some",
"text": "Some",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,17 @@
*italic*
**bold**
underline
[hyperlink](https://github.com/DS4SD/docling)
[***italic and bold hyperlink***](https://github.com/DS4SD/docling)
Normal *italic* **bold** underline and [hyperlink](https://github.com/DS4SD/docling) on the same line
- *Italic bullet 1*
- **Bold bullet 2**
- Underline bullet 3
- Some *italic* **bold** underline
- Nested *italic* **bold**

View File

@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown() pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md"
pred_itxt: str = doc._export_to_indented_text( pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False max_text_len=70, explicit_tables=False
) )
assert verify_export( assert verify_export(
pred_itxt, str(gt_path) + ".itxt" pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
), "export to indented-text" ), "export to indented-text"
assert verify_document( assert verify_document(
doc, str(gt_path) + ".json", GENERATE doc, str(gt_path) + ".json", generate=GENERATE
), "document document" ), "document document"
if docx_path.name == "word_tables.docx": if docx_path.name == "word_tables.docx":