feat(docx): add text formatting and hyperlink support (#630)
* feat: Enable markdown text formatting for docx Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix imports Signed-off-by: SimJeg <sjegou@nvidia.com> * Use Formatting Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle hyperlink Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle formatting properly for DocItemLabel.PARAGRAPH Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline group Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle bullet lists Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Strip elements Signed-off-by: SimJeg <sjegou@nvidia.com> * Run black and mypy Signed-off-by: SimJeg <sjegou@nvidia.com> * Handle header and footer Signed-off-by: SimJeg <sjegou@nvidia.com> * Use inline_fmt everywhere Signed-off-by: SimJeg <sjegou@nvidia.com> * Run precommit Signed-off-by: SimJeg <sjegou@nvidia.com> * Address feedback Signed-off-by: SimJeg <sjegou@nvidia.com> * Fix add_list_item Signed-off-by: SimJeg <sjegou@nvidia.com> * fix minor bugs, mark helper methods internal Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: SimJeg <sjegou@nvidia.com> Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
71148eb381
commit
bfcab3d677
@ -14,15 +14,19 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.doc.document import Formatting
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.document import Document as DocxDocument
|
from docx.document import Document as DocxDocument
|
||||||
from docx.oxml.table import CT_Tc
|
from docx.oxml.table import CT_Tc
|
||||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||||
from docx.table import Table, _Cell
|
from docx.table import Table, _Cell
|
||||||
|
from docx.text.hyperlink import Hyperlink
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
|
from docx.text.run import Run
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
from pydantic import AnyUrl
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
@ -118,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
return doc
|
return doc
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_history(
|
def _update_history(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
level: Optional[int],
|
level: Optional[int],
|
||||||
@ -138,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.history["numids"].append(numid)
|
self.history["numids"].append(numid)
|
||||||
self.history["indents"].append(ilevel)
|
self.history["indents"].append(ilevel)
|
||||||
|
|
||||||
def prev_name(self) -> Optional[str]:
|
def _prev_name(self) -> Optional[str]:
|
||||||
return self.history["names"][-1]
|
return self.history["names"][-1]
|
||||||
|
|
||||||
def prev_level(self) -> Optional[int]:
|
def _prev_level(self) -> Optional[int]:
|
||||||
return self.history["levels"][-1]
|
return self.history["levels"][-1]
|
||||||
|
|
||||||
def prev_numid(self) -> Optional[int]:
|
def _prev_numid(self) -> Optional[int]:
|
||||||
return self.history["numids"][-1]
|
return self.history["numids"][-1]
|
||||||
|
|
||||||
def prev_indent(self) -> Optional[int]:
|
def _prev_indent(self) -> Optional[int]:
|
||||||
return self.history["indents"][-1]
|
return self.history["indents"][-1]
|
||||||
|
|
||||||
def get_level(self) -> int:
|
def _get_level(self) -> int:
|
||||||
"""Return the first None index."""
|
"""Return the first None index."""
|
||||||
for k, v in self.parents.items():
|
for k, v in self.parents.items():
|
||||||
if k >= 0 and v == None:
|
if k >= 0 and v == None:
|
||||||
return k
|
return k
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def walk_linear(
|
def _walk_linear(
|
||||||
self,
|
self,
|
||||||
body: BaseOxmlElement,
|
body: BaseOxmlElement,
|
||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
@ -177,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Check for Tables
|
# Check for Tables
|
||||||
if element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
try:
|
try:
|
||||||
self.handle_tables(element, docx_obj, doc)
|
self._handle_tables(element, docx_obj, doc)
|
||||||
except Exception:
|
except Exception:
|
||||||
_log.debug("could not parse a table, broken docx table")
|
_log.debug("could not parse a table, broken docx table")
|
||||||
|
|
||||||
elif drawing_blip:
|
elif drawing_blip:
|
||||||
self.handle_pictures(docx_obj, drawing_blip, doc)
|
self._handle_pictures(docx_obj, drawing_blip, doc)
|
||||||
# Check for the sdt containers, like table of contents
|
# Check for the sdt containers, like table of contents
|
||||||
elif tag_name in ["sdt"]:
|
elif tag_name in ["sdt"]:
|
||||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||||
@ -190,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
|
||||||
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
|
||||||
for p in paragraphs:
|
for p in paragraphs:
|
||||||
self.handle_text_elements(p, docx_obj, doc)
|
self._handle_text_elements(p, docx_obj, doc)
|
||||||
# Check for Text
|
# Check for Text
|
||||||
elif tag_name in ["p"]:
|
elif tag_name in ["p"]:
|
||||||
# "tcPr", "sectPr"
|
# "tcPr", "sectPr"
|
||||||
self.handle_text_elements(element, docx_obj, doc)
|
self._handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
|
def _str_to_int(
|
||||||
|
self, s: Optional[str], default: Optional[int] = 0
|
||||||
|
) -> Optional[int]:
|
||||||
if s is None:
|
if s is None:
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
@ -207,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def split_text_and_number(self, input_string: str) -> list[str]:
|
def _split_text_and_number(self, input_string: str) -> list[str]:
|
||||||
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
||||||
if match:
|
if match:
|
||||||
parts = list(filter(None, match.groups()))
|
parts = list(filter(None, match.groups()))
|
||||||
@ -215,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return [input_string]
|
return [input_string]
|
||||||
|
|
||||||
def get_numId_and_ilvl(
|
def _get_numId_and_ilvl(
|
||||||
self, paragraph: Paragraph
|
self, paragraph: Paragraph
|
||||||
) -> tuple[Optional[int], Optional[int]]:
|
) -> tuple[Optional[int], Optional[int]]:
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
@ -230,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
||||||
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
||||||
|
|
||||||
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
|
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
|
||||||
|
|
||||||
return None, None # If the paragraph is not part of a list
|
return None, None # If the paragraph is not part of a list
|
||||||
|
|
||||||
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
|
||||||
parts = self.split_text_and_number(style_label)
|
parts = self._split_text_and_number(style_label)
|
||||||
|
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
parts.sort()
|
parts.sort()
|
||||||
@ -243,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label_level: Optional[int] = 0
|
label_level: Optional[int] = 0
|
||||||
if parts[0].strip().lower() == "heading":
|
if parts[0].strip().lower() == "heading":
|
||||||
label_str = "Heading"
|
label_str = "Heading"
|
||||||
label_level = self.str_to_int(parts[1], None)
|
label_level = self._str_to_int(parts[1], None)
|
||||||
if parts[1].strip().lower() == "heading":
|
if parts[1].strip().lower() == "heading":
|
||||||
label_str = "Heading"
|
label_str = "Heading"
|
||||||
label_level = self.str_to_int(parts[0], None)
|
label_level = self._str_to_int(parts[0], None)
|
||||||
return label_str, label_level
|
return label_str, label_level
|
||||||
|
|
||||||
return style_label, None
|
return style_label, None
|
||||||
|
|
||||||
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
||||||
if paragraph.style is None:
|
if paragraph.style is None:
|
||||||
return "Normal", None
|
return "Normal", None
|
||||||
|
|
||||||
@ -264,16 +270,82 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if ":" in label:
|
if ":" in label:
|
||||||
parts = label.split(":")
|
parts = label.split(":")
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
return parts[0], self.str_to_int(parts[1], None)
|
return parts[0], self._str_to_int(parts[1], None)
|
||||||
|
|
||||||
if "heading" in label.lower():
|
if "heading" in label.lower():
|
||||||
return self.get_heading_and_level(label)
|
return self._get_heading_and_level(label)
|
||||||
if "heading" in name.lower():
|
if "heading" in name.lower():
|
||||||
return self.get_heading_and_level(name)
|
return self._get_heading_and_level(name)
|
||||||
|
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
def handle_equations_in_text(self, element, text):
|
@classmethod
|
||||||
|
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
||||||
|
has_any_formatting = run.bold or run.italic or run.underline
|
||||||
|
return (
|
||||||
|
Formatting(
|
||||||
|
bold=run.bold or False,
|
||||||
|
italic=run.italic or False,
|
||||||
|
underline=run.underline or False,
|
||||||
|
)
|
||||||
|
if has_any_formatting
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_paragraph_elements(self, paragraph: Paragraph):
|
||||||
|
"""
|
||||||
|
Extract paragraph elements along with their formatting and hyperlink
|
||||||
|
"""
|
||||||
|
|
||||||
|
# for now retain empty paragraphs for backwards compatibility:
|
||||||
|
if paragraph.text.strip() == "":
|
||||||
|
return [("", None, None)]
|
||||||
|
|
||||||
|
paragraph_elements: list[
|
||||||
|
tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
|
||||||
|
] = []
|
||||||
|
group_text = ""
|
||||||
|
previous_format = None
|
||||||
|
|
||||||
|
# Iterate over the runs of the paragraph and group them by format
|
||||||
|
for c in paragraph.iter_inner_content():
|
||||||
|
if isinstance(c, Hyperlink):
|
||||||
|
text = c.text
|
||||||
|
hyperlink = Path(c.address)
|
||||||
|
format = self._get_format_from_run(c.runs[0])
|
||||||
|
elif isinstance(c, Run):
|
||||||
|
text = c.text
|
||||||
|
hyperlink = None
|
||||||
|
format = self._get_format_from_run(c)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if (len(text.strip()) and format != previous_format) or (
|
||||||
|
hyperlink is not None
|
||||||
|
):
|
||||||
|
# If the style changes for a non empty text, add the previous group
|
||||||
|
if len(group_text.strip()) > 0:
|
||||||
|
paragraph_elements.append(
|
||||||
|
(group_text.strip(), previous_format, None)
|
||||||
|
)
|
||||||
|
group_text = ""
|
||||||
|
|
||||||
|
# If there is a hyperlink, add it immediately
|
||||||
|
if hyperlink is not None:
|
||||||
|
paragraph_elements.append((text.strip(), format, hyperlink))
|
||||||
|
text = ""
|
||||||
|
else:
|
||||||
|
previous_format = format
|
||||||
|
|
||||||
|
group_text += text
|
||||||
|
|
||||||
|
# Format the last group
|
||||||
|
if len(group_text.strip()) > 0:
|
||||||
|
paragraph_elements.append((group_text.strip(), format, None))
|
||||||
|
|
||||||
|
return paragraph_elements
|
||||||
|
|
||||||
|
def _handle_equations_in_text(self, element, text):
|
||||||
only_texts = []
|
only_texts = []
|
||||||
only_equations = []
|
only_equations = []
|
||||||
texts_and_equations = []
|
texts_and_equations = []
|
||||||
@ -319,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return output_text, only_equations
|
return output_text, only_equations
|
||||||
|
|
||||||
def handle_text_elements(
|
def _create_or_reuse_parent(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
prev_parent: Optional[NodeItem],
|
||||||
|
paragraph_elements: list,
|
||||||
|
) -> Optional[NodeItem]:
|
||||||
|
return (
|
||||||
|
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
|
||||||
|
if len(paragraph_elements) > 1
|
||||||
|
else prev_parent
|
||||||
|
)
|
||||||
|
|
||||||
|
def _handle_text_elements(
|
||||||
self,
|
self,
|
||||||
element: BaseOxmlElement,
|
element: BaseOxmlElement,
|
||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
@ -328,10 +413,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
|
||||||
raw_text = paragraph.text
|
raw_text = paragraph.text
|
||||||
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
|
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
||||||
|
|
||||||
if text is None:
|
if text is None:
|
||||||
return
|
return
|
||||||
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
@ -339,8 +425,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Identify whether list is a numbered list or not
|
# Identify whether list is a numbered list or not
|
||||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
p_style_id, p_level = self.get_label_and_level(paragraph)
|
p_style_id, p_level = self._get_label_and_level(paragraph)
|
||||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
||||||
|
|
||||||
if numid == 0:
|
if numid == 0:
|
||||||
numid = None
|
numid = None
|
||||||
@ -351,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
and ilevel is not None
|
and ilevel is not None
|
||||||
and p_style_id not in ["Title", "Heading"]
|
and p_style_id not in ["Title", "Heading"]
|
||||||
):
|
):
|
||||||
self.add_listitem(
|
self._add_list_item(
|
||||||
doc,
|
doc=doc,
|
||||||
numid,
|
numid=numid,
|
||||||
ilevel,
|
ilevel=ilevel,
|
||||||
text,
|
elements=paragraph_elements,
|
||||||
is_numbered,
|
is_numbered=is_numbered,
|
||||||
)
|
)
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
elif (
|
elif (
|
||||||
numid is None
|
numid is None
|
||||||
and self.prev_numid() is not None
|
and self._prev_numid() is not None
|
||||||
and p_style_id not in ["Title", "Heading"]
|
and p_style_id not in ["Title", "Heading"]
|
||||||
): # Close list
|
): # Close list
|
||||||
if self.level_at_new_list:
|
if self.level_at_new_list:
|
||||||
@ -390,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
is_numbered_style = False
|
is_numbered_style = False
|
||||||
self.add_header(doc, p_level, text, is_numbered_style)
|
self._add_header(doc, p_level, text, is_numbered_style)
|
||||||
|
|
||||||
elif len(equations) > 0:
|
elif len(equations) > 0:
|
||||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
||||||
# Standalone equation
|
# Standalone equation
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.FORMULA,
|
label=DocItemLabel.FORMULA,
|
||||||
parent=self.parents[level - 1],
|
parent=self.parents[level - 1],
|
||||||
@ -403,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Inline equation
|
# Inline equation
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
inline_equation = doc.add_group(
|
inline_equation = doc.add_group(
|
||||||
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||||
)
|
)
|
||||||
@ -442,30 +528,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"ListBullet",
|
"ListBullet",
|
||||||
"Quote",
|
"Quote",
|
||||||
]:
|
]:
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
|
parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents.get(level - 1),
|
||||||
|
paragraph_elements=paragraph_elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Text style names can, and will have, not only default values but user values too
|
# Text style names can, and will have, not only default values but user values too
|
||||||
# hence we treat all other labels as pure text
|
# hence we treat all other labels as pure text
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
|
parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents.get(level - 1),
|
||||||
|
paragraph_elements=paragraph_elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self._update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_header(
|
def _add_header(
|
||||||
self,
|
self,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
curr_level: Optional[int],
|
curr_level: Optional[int],
|
||||||
text: str,
|
text: str,
|
||||||
is_numbered_style: bool = False,
|
is_numbered_style: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
if isinstance(curr_level, int):
|
if isinstance(curr_level, int):
|
||||||
if curr_level > level:
|
if curr_level > level:
|
||||||
# add invisible group
|
# add invisible group
|
||||||
@ -521,19 +627,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_listitem(
|
def _add_list_item(
|
||||||
self,
|
self,
|
||||||
|
*,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
numid: int,
|
numid: int,
|
||||||
ilevel: int,
|
ilevel: int,
|
||||||
text: str,
|
elements: list,
|
||||||
is_numbered: bool = False,
|
is_numbered: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
|
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
prev_indent = self.prev_indent()
|
prev_indent = self._prev_indent()
|
||||||
if self.prev_numid() is None: # Open new list
|
if self._prev_numid() is None: # Open new list
|
||||||
self.level_at_new_list = level
|
self.level_at_new_list = level
|
||||||
|
|
||||||
self.parents[level] = doc.add_group(
|
self.parents[level] = doc.add_group(
|
||||||
@ -545,15 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
new_parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents[level],
|
||||||
|
paragraph_elements=elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[level],
|
parent=new_parent,
|
||||||
text=text,
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self.prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
and prev_indent is not None
|
and prev_indent is not None
|
||||||
and prev_indent < ilevel
|
and prev_indent < ilevel
|
||||||
@ -581,15 +696,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
|
||||||
|
new_parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
|
paragraph_elements=elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=new_parent,
|
||||||
text=text,
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self.prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
and prev_indent is not None
|
and prev_indent is not None
|
||||||
and ilevel < prev_indent
|
and ilevel < prev_indent
|
||||||
@ -603,29 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
new_parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
|
paragraph_elements=elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[self.level_at_new_list + ilevel],
|
parent=new_parent,
|
||||||
text=text,
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
elif self.prev_numid() == numid or prev_indent == ilevel:
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
self.listIter += 1
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
new_parent = self._create_or_reuse_parent(
|
||||||
|
doc=doc,
|
||||||
|
prev_parent=self.parents[level - 1],
|
||||||
|
paragraph_elements=elements,
|
||||||
|
)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
|
# Add the list item to the parent group
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
parent=self.parents[level - 1],
|
parent=new_parent,
|
||||||
text=text,
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_tables(
|
def _handle_tables(
|
||||||
self,
|
self,
|
||||||
element: BaseOxmlElement,
|
element: BaseOxmlElement,
|
||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
@ -640,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
cell_element = table.rows[0].cells[0]
|
cell_element = table.rows[0].cells[0]
|
||||||
# In case we have a table of only 1 cell, we consider it furniture
|
# In case we have a table of only 1 cell, we consider it furniture
|
||||||
# And proceed processing the content of the cell as though it's in the document body
|
# And proceed processing the content of the cell as though it's in the document body
|
||||||
self.walk_linear(cell_element._element, docx_obj, doc)
|
self._walk_linear(cell_element._element, docx_obj, doc)
|
||||||
return
|
return
|
||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||||
@ -685,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
data.table_cells.append(table_cell)
|
data.table_cells.append(table_cell)
|
||||||
col_idx += cell.grid_span
|
col_idx += cell.grid_span
|
||||||
|
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_pictures(
|
def _handle_pictures(
|
||||||
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||||
) -> None:
|
) -> None:
|
||||||
def get_docx_image(drawing_blip):
|
def get_docx_image(drawing_blip):
|
||||||
@ -702,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
image_data = image_part.blob # Get the binary image data
|
image_data = image_part.blob # Get the binary image data
|
||||||
return image_data
|
return image_data
|
||||||
|
|
||||||
level = self.get_level()
|
level = self._get_level()
|
||||||
# Open the BytesIO object with PIL to create an Image
|
# Open the BytesIO object with PIL to create an Image
|
||||||
try:
|
try:
|
||||||
image_data = get_docx_image(drawing_blip)
|
image_data = get_docx_image(drawing_blip)
|
||||||
|
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
Binary file not shown.
@ -0,0 +1,30 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: paragraph: italic
|
||||||
|
item-2 at level 1: paragraph: bold
|
||||||
|
item-3 at level 1: paragraph: underline
|
||||||
|
item-4 at level 1: paragraph: hyperlink
|
||||||
|
item-5 at level 1: paragraph: italic and bold hyperlink
|
||||||
|
item-6 at level 1: inline: group group
|
||||||
|
item-7 at level 2: paragraph: Normal
|
||||||
|
item-8 at level 2: paragraph: italic
|
||||||
|
item-9 at level 2: paragraph: bold
|
||||||
|
item-10 at level 2: paragraph: underline
|
||||||
|
item-11 at level 2: paragraph: and
|
||||||
|
item-12 at level 2: paragraph: hyperlink
|
||||||
|
item-13 at level 2: paragraph: on the same line
|
||||||
|
item-14 at level 1: paragraph:
|
||||||
|
item-15 at level 1: list: group list
|
||||||
|
item-16 at level 2: list_item: Italic bullet 1
|
||||||
|
item-17 at level 2: list_item: Bold bullet 2
|
||||||
|
item-18 at level 2: list_item: Underline bullet 3
|
||||||
|
item-19 at level 2: inline: group group
|
||||||
|
item-20 at level 3: list_item: Some
|
||||||
|
item-21 at level 3: list_item: italic
|
||||||
|
item-22 at level 3: list_item: bold
|
||||||
|
item-23 at level 3: list_item: underline
|
||||||
|
item-24 at level 2: list: group list
|
||||||
|
item-25 at level 3: inline: group group
|
||||||
|
item-26 at level 4: list_item: Nested
|
||||||
|
item-27 at level 4: list_item: italic
|
||||||
|
item-28 at level 4: list_item: bold
|
||||||
|
item-29 at level 1: paragraph:
|
577
tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
Normal file
577
tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
Normal file
@ -0,0 +1,577 @@
|
|||||||
|
{
|
||||||
|
"schema_name": "DoclingDocument",
|
||||||
|
"version": "1.3.0",
|
||||||
|
"name": "unit_test_formatting",
|
||||||
|
"origin": {
|
||||||
|
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"binary_hash": 16380079676357958448,
|
||||||
|
"filename": "unit_test_formatting.docx"
|
||||||
|
},
|
||||||
|
"furniture": {
|
||||||
|
"self_ref": "#/furniture",
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "furniture",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"body": {
|
||||||
|
"self_ref": "#/body",
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/12"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/23"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "_root_",
|
||||||
|
"label": "unspecified"
|
||||||
|
},
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/9"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/10"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/11"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/13"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/14"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/15"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "list",
|
||||||
|
"label": "list"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/2",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/16"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/17"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/18"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/19"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "list",
|
||||||
|
"label": "list"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/20"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/21"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/22"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "group",
|
||||||
|
"label": "inline"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"texts": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/1",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/2",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "underline",
|
||||||
|
"text": "underline",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/3",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "hyperlink",
|
||||||
|
"text": "hyperlink",
|
||||||
|
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic and bold hyperlink",
|
||||||
|
"text": "italic and bold hyperlink",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Normal",
|
||||||
|
"text": "Normal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/7",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/8",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "underline",
|
||||||
|
"text": "underline",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/9",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "and",
|
||||||
|
"text": "and"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/10",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "hyperlink",
|
||||||
|
"text": "hyperlink",
|
||||||
|
"hyperlink": "https:/github.com/DS4SD/docling"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/11",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "on the same line",
|
||||||
|
"text": "on the same line"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/12",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/13",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Italic bullet 1",
|
||||||
|
"text": "Italic bullet 1",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/14",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Bold bullet 2",
|
||||||
|
"text": "Bold bullet 2",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/15",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/1"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Underline bullet 3",
|
||||||
|
"text": "Underline bullet 3",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/16",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Some",
|
||||||
|
"text": "Some",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/17",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/18",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/19",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "underline",
|
||||||
|
"text": "underline",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/20",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Nested",
|
||||||
|
"text": "Nested",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/21",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/22",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/23",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "paragraph",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pictures": [],
|
||||||
|
"tables": [],
|
||||||
|
"key_value_items": [],
|
||||||
|
"form_items": [],
|
||||||
|
"pages": {}
|
||||||
|
}
|
@ -0,0 +1,17 @@
|
|||||||
|
*italic*
|
||||||
|
|
||||||
|
**bold**
|
||||||
|
|
||||||
|
underline
|
||||||
|
|
||||||
|
[hyperlink](https:/github.com/DS4SD/docling)
|
||||||
|
|
||||||
|
[***italic and bold hyperlink***](https:/github.com/DS4SD/docling)
|
||||||
|
|
||||||
|
Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line
|
||||||
|
|
||||||
|
- *Italic bullet 1*
|
||||||
|
- **Bold bullet 2**
|
||||||
|
- Underline bullet 3
|
||||||
|
- Some *italic* **bold** underline
|
||||||
|
- Nested *italic* **bold**
|
@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
|
|||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
|
assert verify_export(
|
||||||
|
pred_md, str(gt_path) + ".md", generate=GENERATE
|
||||||
|
), "export to md"
|
||||||
|
|
||||||
pred_itxt: str = doc._export_to_indented_text(
|
pred_itxt: str = doc._export_to_indented_text(
|
||||||
max_text_len=70, explicit_tables=False
|
max_text_len=70, explicit_tables=False
|
||||||
)
|
)
|
||||||
assert verify_export(
|
assert verify_export(
|
||||||
pred_itxt, str(gt_path) + ".itxt"
|
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
|
||||||
), "export to indented-text"
|
), "export to indented-text"
|
||||||
|
|
||||||
assert verify_document(
|
assert verify_document(
|
||||||
doc, str(gt_path) + ".json", GENERATE
|
doc, str(gt_path) + ".json", generate=GENERATE
|
||||||
), "document document"
|
), "document document"
|
||||||
|
|
||||||
if docx_path.name == "word_tables.docx":
|
if docx_path.name == "word_tables.docx":
|
||||||
|
Loading…
Reference in New Issue
Block a user