feat(docx): add text formatting and hyperlink support (#630)
* feat: Enable markdown text formatting for docx Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix imports Signed-off-by: SimJeg <sjegou@nvidia.com> * Use Formatting Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle hyperlink Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle formatting properly for DocItemLabel.PARAGRAPH Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline group Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle bullet lists Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Run black and mypy Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle header and footer Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline_fmt everywhere Signed-off-by: SimJeg <sjegou@nvidia.com> * Run precommit Signed-off-by: SimJeg <sjegou@nvidia.com> * Address feedback Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix add_list_item Signed-off-by: SimJeg <sjegou@nvidia.com> * fix minor bugs, mark helper methods internal Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: SimJeg <sjegou@nvidia.com> Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
71148eb381
commit
bfcab3d677
@ -14,15 +14,19 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||
from docx.table import Table, _Cell
|
||||
from docx.text.hyperlink import Hyperlink
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
from lxml import etree
|
||||
from lxml.etree import XPath
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from pydantic import AnyUrl
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
@ -118,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
return doc
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
|
||||
def update_history(
|
||||
def _update_history(
|
||||
self,
|
||||
name: str,
|
||||
level: Optional[int],
|
||||
@ -138,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.history["numids"].append(numid)
|
||||
self.history["indents"].append(ilevel)
|
||||
|
||||
def prev_name(self) -> Optional[str]:
|
||||
def _prev_name(self) -> Optional[str]:
|
||||
return self.history["names"][-1]
|
||||
|
||||
def prev_level(self) -> Optional[int]:
|
||||
def _prev_level(self) -> Optional[int]:
|
||||
return self.history["levels"][-1]
|
||||
|
||||
def prev_numid(self) -> Optional[int]:
|
||||
def _prev_numid(self) -> Optional[int]:
|
||||
return self.history["numids"][-1]
|
||||
|
||||
def prev_indent(self) -> Optional[int]:
|
||||
def _prev_indent(self) -> Optional[int]:
|
||||
return self.history["indents"][-1]
|
||||
|
||||
def get_level(self) -> int:
|
||||
def _get_level(self) -> int:
|
||||
"""Return the first None index."""
|
||||
for k, v in self.parents.items():
|
||||
if k >= 0 and v == None:
|
||||
return k
|
||||
return 0
|
||||
|
||||
def walk_linear(
|
||||
def _walk_linear(
|
||||
self,
|
||||
body: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
@ -177,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Check for Tables
|
||||
if element.tag.endswith("tbl"):
|
||||
try:
|
||||
self.handle_tables(element, docx_obj, doc)
|
||||
self._handle_tables(element, docx_obj, doc)
|
||||
except Exception:
|
||||
_log.debug("could not parse a table, broken docx table")
|
||||
|
||||
elif drawing_blip:
|
||||
self.handle_pictures(docx_obj, drawing_blip, doc)
|
||||
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||
# Check for the sdt containers, like table of contents
|
||||
elif tag_name in ["sdt"]:
|
||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||
@ -190,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
||||
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
||||
for p in paragraphs:
|
||||
self.handle_text_elements(p, docx_obj, doc)
|
||||
self._handle_text_elements(p, docx_obj, doc)
|
||||
# Check for Text
|
||||
elif tag_name in ["p"]:
|
||||
# "tcPr", "sectPr"
|
||||
self.handle_text_elements(element, docx_obj, doc)
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
return doc
|
||||
|
||||
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
|
||||
def _str_to_int(
|
||||
self, s: Optional[str], default: Optional[int] = 0
|
||||
) -> Optional[int]:
|
||||
if s is None:
|
||||
return None
|
||||
try:
|
||||
@ -207,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
def split_text_and_number(self, input_string: str) -> list[str]:
|
||||
def _split_text_and_number(self, input_string: str) -> list[str]:
|
||||
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
||||
if match:
|
||||
parts = list(filter(None, match.groups()))
|
||||
@ -215,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
return [input_string]
|
||||
|
||||
def get_numId_and_ilvl(
|
||||
def _get_numId_and_ilvl(
|
||||
self, paragraph: Paragraph
|
||||
) -> tuple[Optional[int], Optional[int]]:
|
||||
# Access the XML element of the paragraph
|
||||
@ -230,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
||||
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
||||
|
||||
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
|
||||
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
|
||||
|
||||
return None, None # If the paragraph is not part of a list
|
||||
|
||||
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
||||
parts = self.split_text_and_number(style_label)
|
||||
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
||||
parts = self._split_text_and_number(style_label)
|
||||
|
||||
if len(parts) == 2:
|
||||
parts.sort()
|
||||
@ -243,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
label_level: Optional[int] = 0
|
||||
if parts[0].strip().lower() == "heading":
|
||||
label_str = "Heading"
|
||||
label_level = self.str_to_int(parts[1], None)
|
||||
label_level = self._str_to_int(parts[1], None)
|
||||
if parts[1].strip().lower() == "heading":
|
||||
label_str = "Heading"
|
||||
label_level = self.str_to_int(parts[0], None)
|
||||
label_level = self._str_to_int(parts[0], None)
|
||||
return label_str, label_level
|
||||
|
||||
return style_label, None
|
||||
|
||||
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
||||
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
||||
if paragraph.style is None:
|
||||
return "Normal", None
|
||||
|
||||
@ -264,16 +270,82 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if ":" in label:
|
||||
parts = label.split(":")
|
||||
if len(parts) == 2:
|
||||
return parts[0], self.str_to_int(parts[1], None)
|
||||
return parts[0], self._str_to_int(parts[1], None)
|
||||
|
||||
if "heading" in label.lower():
|
||||
return self.get_heading_and_level(label)
|
||||
return self._get_heading_and_level(label)
|
||||
if "heading" in name.lower():
|
||||
return self.get_heading_and_level(name)
|
||||
return self._get_heading_and_level(name)
|
||||
|
||||
return label, None
|
||||
|
||||
def handle_equations_in_text(self, element, text):
|
||||
@classmethod
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
    """Build a ``Formatting`` from a run's bold, italic and underline attributes.

    Args:
        run: The run whose ``bold``, ``italic`` and ``underline`` attributes
            are read.

    Returns:
        A ``Formatting`` with the three flags set, or ``None`` when none of
        the three attributes is truthy.
    """
    # Coerce to plain booleans: the attributes may be None, and python-docx
    # documents ``underline`` as possibly being a WD_UNDERLINE member rather
    # than True/False, which must not be forwarded as-is to a boolean flag.
    is_bold = bool(run.bold)
    is_italic = bool(run.italic)
    is_underlined = bool(run.underline)

    if not (is_bold or is_italic or is_underlined):
        return None

    return Formatting(bold=is_bold, italic=is_italic, underline=is_underlined)
|
||||
|
||||
def _get_paragraph_elements(
    self, paragraph: Paragraph
) -> list[tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]]:
    """Split a paragraph into ``(text, formatting, hyperlink)`` tuples.

    Consecutive runs that share the same formatting are merged into one
    element; every hyperlink becomes an element of its own. Element texts
    are stripped of surrounding whitespace.

    Args:
        paragraph: The paragraph whose inner content (runs and hyperlinks)
            is iterated.

    Returns:
        The list of elements in document order. A paragraph whose text is
        empty or whitespace-only yields the single element ``("", None, None)``.
    """
    # for now retain empty paragraphs for backwards compatibility:
    if paragraph.text.strip() == "":
        return [("", None, None)]

    paragraph_elements: list[
        tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
    ] = []
    group_text = ""
    # Formatting of the group currently being accumulated in ``group_text``.
    previous_format: Optional[Formatting] = None

    # Iterate over the runs of the paragraph and group them by format
    for item in paragraph.iter_inner_content():
        hyperlink: Optional[Path] = None
        if isinstance(item, Hyperlink):
            text = item.text
            # NOTE(review): Path() collapses repeated slashes, so a URL such
            # as "https://host/x" is stored as "https:/host/x". Kept as-is
            # because the stored ground truth reflects this behavior.
            hyperlink = Path(item.address)
            # A hyperlink without runs has no run to read formatting from.
            run_format = (
                self._get_format_from_run(item.runs[0]) if item.runs else None
            )
        elif isinstance(item, Run):
            text = item.text
            run_format = self._get_format_from_run(item)
        else:
            continue

        if hyperlink is not None:
            # A hyperlink always closes the pending group and is emitted
            # immediately as its own element; it does not start a new group.
            pending = group_text.strip()
            if pending:
                paragraph_elements.append((pending, previous_format, None))
            group_text = ""
            paragraph_elements.append((text.strip(), run_format, hyperlink))
            continue

        if text.strip() and run_format != previous_format:
            # The style changes on non-empty text: close the pending group.
            # Whitespace-only runs never trigger a change, so they are
            # absorbed into the surrounding group.
            pending = group_text.strip()
            if pending:
                paragraph_elements.append((pending, previous_format, None))
            group_text = ""
            previous_format = run_format

        group_text += text

    # Emit the last group with the formatting of the group itself, not of
    # the last item visited (which may be a trailing whitespace-only run
    # carrying different formatting).
    pending = group_text.strip()
    if pending:
        paragraph_elements.append((pending, previous_format, None))

    return paragraph_elements
|
||||
|
||||
def _handle_equations_in_text(self, element, text):
|
||||
only_texts = []
|
||||
only_equations = []
|
||||
texts_and_equations = []
|
||||
@ -319,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return output_text, only_equations
|
||||
|
||||
def handle_text_elements(
|
||||
def _create_or_reuse_parent(
    self,
    *,
    doc: DoclingDocument,
    prev_parent: Optional[NodeItem],
    paragraph_elements: list,
) -> Optional[NodeItem]:
    """Choose the parent node for a paragraph's elements.

    Args:
        doc: Document on which a new group is created when needed.
        prev_parent: Candidate parent node; may be ``None``.
        paragraph_elements: Elements of the paragraph being added.

    Returns:
        ``prev_parent`` unchanged when there is at most one element;
        otherwise a new inline group added to ``doc`` under ``prev_parent``.
    """
    # Zero or one element needs no grouping.
    if len(paragraph_elements) <= 1:
        return prev_parent

    return doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
|
||||
|
||||
def _handle_text_elements(
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
@ -328,10 +413,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
raw_text = paragraph.text
|
||||
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
|
||||
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@ -339,8 +425,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Identify whether list is a numbered list or not
|
||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||
is_numbered = False
|
||||
p_style_id, p_level = self.get_label_and_level(paragraph)
|
||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||
p_style_id, p_level = self._get_label_and_level(paragraph)
|
||||
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
||||
|
||||
if numid == 0:
|
||||
numid = None
|
||||
@ -351,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
and ilevel is not None
|
||||
and p_style_id not in ["Title", "Heading"]
|
||||
):
|
||||
self.add_listitem(
|
||||
doc,
|
||||
numid,
|
||||
ilevel,
|
||||
text,
|
||||
is_numbered,
|
||||
self._add_list_item(
|
||||
doc=doc,
|
||||
numid=numid,
|
||||
ilevel=ilevel,
|
||||
elements=paragraph_elements,
|
||||
is_numbered=is_numbered,
|
||||
)
|
||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
elif (
|
||||
numid is None
|
||||
and self.prev_numid() is not None
|
||||
and self._prev_numid() is not None
|
||||
and p_style_id not in ["Title", "Heading"]
|
||||
): # Close list
|
||||
if self.level_at_new_list:
|
||||
@ -390,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
is_numbered_style = False
|
||||
self.add_header(doc, p_level, text, is_numbered_style)
|
||||
self._add_header(doc, p_level, text, is_numbered_style)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
||||
# Standalone equation
|
||||
level = self.get_level()
|
||||
level = self._get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.FORMULA,
|
||||
parent=self.parents[level - 1],
|
||||
@ -403,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
# Inline equation
|
||||
level = self.get_level()
|
||||
level = self._get_level()
|
||||
inline_equation = doc.add_group(
|
||||
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||
)
|
||||
@ -442,30 +528,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
"ListBullet",
|
||||
"Quote",
|
||||
]:
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
level = self._get_level()
|
||||
parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents.get(level - 1),
|
||||
paragraph_elements=paragraph_elements,
|
||||
)
|
||||
for text, format, hyperlink in paragraph_elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
else:
|
||||
# Text style names can, and will have, not only default values but user values too
|
||||
# hence we treat all other labels as pure text
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||
level = self._get_level()
|
||||
parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents.get(level - 1),
|
||||
paragraph_elements=paragraph_elements,
|
||||
)
|
||||
for text, format, hyperlink in paragraph_elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||
return
|
||||
|
||||
def add_header(
|
||||
def _add_header(
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> None:
|
||||
level = self.get_level()
|
||||
level = self._get_level()
|
||||
if isinstance(curr_level, int):
|
||||
if curr_level > level:
|
||||
# add invisible group
|
||||
@ -521,19 +627,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def add_listitem(
|
||||
def _add_list_item(
|
||||
self,
|
||||
*,
|
||||
doc: DoclingDocument,
|
||||
numid: int,
|
||||
ilevel: int,
|
||||
text: str,
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
enum_marker = ""
|
||||
|
||||
level = self.get_level()
|
||||
prev_indent = self.prev_indent()
|
||||
if self.prev_numid() is None: # Open new list
|
||||
level = self._get_level()
|
||||
prev_indent = self._prev_indent()
|
||||
if self._prev_numid() is None: # Open new list
|
||||
self.level_at_new_list = level
|
||||
|
||||
self.parents[level] = doc.add_group(
|
||||
@ -545,15 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
self.prev_numid() == numid
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
and prev_indent is not None
|
||||
and prev_indent < ilevel
|
||||
@ -581,15 +696,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
)
|
||||
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elif (
|
||||
self.prev_numid() == numid
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
and prev_indent is not None
|
||||
and ilevel < prev_indent
|
||||
@ -603,29 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[self.level_at_new_list + ilevel],
|
||||
text=text,
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
elif self.prev_numid() == numid or prev_indent == ilevel:
|
||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level - 1],
|
||||
paragraph_elements=elements,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
# Add the list item to the parent group
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
return
|
||||
|
||||
def handle_tables(
|
||||
def _handle_tables(
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
@ -640,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
cell_element = table.rows[0].cells[0]
|
||||
# In case we have a table of only 1 cell, we consider it furniture
|
||||
# And proceed processing the content of the cell as though it's in the document body
|
||||
self.walk_linear(cell_element._element, docx_obj, doc)
|
||||
self._walk_linear(cell_element._element, docx_obj, doc)
|
||||
return
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||
@ -685,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
data.table_cells.append(table_cell)
|
||||
col_idx += cell.grid_span
|
||||
|
||||
level = self.get_level()
|
||||
level = self._get_level()
|
||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||
return
|
||||
|
||||
def handle_pictures(
|
||||
def _handle_pictures(
|
||||
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||
) -> None:
|
||||
def get_docx_image(drawing_blip):
|
||||
@ -702,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image_data = image_part.blob # Get the binary image data
|
||||
return image_data
|
||||
|
||||
level = self.get_level()
|
||||
level = self._get_level()
|
||||
# Open the BytesIO object with PIL to create an Image
|
||||
try:
|
||||
image_data = get_docx_image(drawing_blip)
|
||||
|
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
Binary file not shown.
@ -0,0 +1,30 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: paragraph: italic
|
||||
item-2 at level 1: paragraph: bold
|
||||
item-3 at level 1: paragraph: underline
|
||||
item-4 at level 1: paragraph: hyperlink
|
||||
item-5 at level 1: paragraph: italic and bold hyperlink
|
||||
item-6 at level 1: inline: group group
|
||||
item-7 at level 2: paragraph: Normal
|
||||
item-8 at level 2: paragraph: italic
|
||||
item-9 at level 2: paragraph: bold
|
||||
item-10 at level 2: paragraph: underline
|
||||
item-11 at level 2: paragraph: and
|
||||
item-12 at level 2: paragraph: hyperlink
|
||||
item-13 at level 2: paragraph: on the same line
|
||||
item-14 at level 1: paragraph:
|
||||
item-15 at level 1: list: group list
|
||||
item-16 at level 2: list_item: Italic bullet 1
|
||||
item-17 at level 2: list_item: Bold bullet 2
|
||||
item-18 at level 2: list_item: Underline bullet 3
|
||||
item-19 at level 2: inline: group group
|
||||
item-20 at level 3: list_item: Some
|
||||
item-21 at level 3: list_item: italic
|
||||
item-22 at level 3: list_item: bold
|
||||
item-23 at level 3: list_item: underline
|
||||
item-24 at level 2: list: group list
|
||||
item-25 at level 3: inline: group group
|
||||
item-26 at level 4: list_item: Nested
|
||||
item-27 at level 4: list_item: italic
|
||||
item-28 at level 4: list_item: bold
|
||||
item-29 at level 1: paragraph:
|
577
tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
Normal file
577
tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
Normal file
@ -0,0 +1,577 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "unit_test_formatting",
|
||||
"origin": {
|
||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"binary_hash": 16380079676357958448,
|
||||
"filename": "unit_test_formatting.docx"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/3"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/4"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/12"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/5"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/6"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/7"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/8"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/9"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/10"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/11"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/13"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/14"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/15"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/18"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/3",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "group",
|
||||
"label": "inline"
|
||||
}
|
||||
],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/1",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/2",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/3",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "hyperlink",
|
||||
"text": "hyperlink",
|
||||
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/4",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "italic and bold hyperlink",
|
||||
"text": "italic and bold hyperlink",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/5",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "Normal",
|
||||
"text": "Normal"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/6",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/7",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/8",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/9",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "and",
|
||||
"text": "and"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/10",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "hyperlink",
|
||||
"text": "hyperlink",
|
||||
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/11",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "on the same line",
|
||||
"text": "on the same line"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/12",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/13",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Italic bullet 1",
|
||||
"text": "Italic bullet 1",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/14",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Bold bullet 2",
|
||||
"text": "Bold bullet 2",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/15",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Underline bullet 3",
|
||||
"text": "Underline bullet 3",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/17",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/22",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "paragraph",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": ""
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {}
|
||||
}
|
@ -0,0 +1,17 @@
|
||||
*italic*
|
||||
|
||||
**bold**
|
||||
|
||||
underline
|
||||
|
||||
[hyperlink](https:/github.com/DS4SD/docling)
|
||||
|
||||
[***italic and bold hyperlink***](https:/github.com/DS4SD/docling)
|
||||
|
||||
Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line
|
||||
|
||||
- *Italic bullet 1*
|
||||
- **Bold bullet 2**
|
||||
- Underline bullet 3
|
||||
- Some *italic* **bold** underline
|
||||
- Nested *italic* **bold**
|
@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
|
||||
doc: DoclingDocument = conv_result.document
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
||||
assert verify_export(
|
||||
pred_md, str(gt_path) + ".md", generate=GENERATE
|
||||
), "export to md"
|
||||
|
||||
pred_itxt: str = doc._export_to_indented_text(
|
||||
max_text_len=70, explicit_tables=False
|
||||
)
|
||||
assert verify_export(
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
doc, str(gt_path) + ".json", generate=GENERATE
|
||||
), "document document"
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
|
Loading…
Reference in New Issue
Block a user