feat(docx): add text formatting and hyperlink support (#630)

* feat: Enable markdown text formatting for docx

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Fix imports

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use Formatting

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle hyperlink

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle formatting properly for DocItemLabel.PARAGRAPH

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use inline group

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle bullet lists

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Strip elements

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Strip elements

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Run black and mypy

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Handle header and footer

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Use inline_fmt everywhere

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Run precommit

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Address feedback

Signed-off-by: SimJeg <sjegou@nvidia.com>

* Fix add_list_item

Signed-off-by: SimJeg <sjegou@nvidia.com>

* fix minor bugs, mark helper methods internal

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: SimJeg <sjegou@nvidia.com>
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Simon Jégou 2025-04-03 15:11:50 +02:00 committed by GitHub
parent 71148eb381
commit bfcab3d677
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 852 additions and 86 deletions

View File

@ -14,15 +14,19 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.table import Table, _Cell
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError
from pydantic import AnyUrl
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@ -118,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
)
def update_history(
def _update_history(
self,
name: str,
level: Optional[int],
@ -138,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.history["numids"].append(numid)
self.history["indents"].append(ilevel)
def prev_name(self) -> Optional[str]:
def _prev_name(self) -> Optional[str]:
return self.history["names"][-1]
def prev_level(self) -> Optional[int]:
def _prev_level(self) -> Optional[int]:
return self.history["levels"][-1]
def prev_numid(self) -> Optional[int]:
def _prev_numid(self) -> Optional[int]:
return self.history["numids"][-1]
def prev_indent(self) -> Optional[int]:
def _prev_indent(self) -> Optional[int]:
return self.history["indents"][-1]
def get_level(self) -> int:
def _get_level(self) -> int:
"""Return the first None index."""
for k, v in self.parents.items():
if k >= 0 and v == None:
return k
return 0
def walk_linear(
def _walk_linear(
self,
body: BaseOxmlElement,
docx_obj: DocxDocument,
@ -177,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check for Tables
if element.tag.endswith("tbl"):
try:
self.handle_tables(element, docx_obj, doc)
self._handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
elif drawing_blip:
self.handle_pictures(docx_obj, drawing_blip, doc)
self._handle_pictures(docx_obj, drawing_blip, doc)
# Check for the sdt containers, like table of contents
elif tag_name in ["sdt"]:
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -190,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Iterate paragraphs, runs, or text inside <w:sdtContent>.
paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces)
for p in paragraphs:
self.handle_text_elements(p, docx_obj, doc)
self._handle_text_elements(p, docx_obj, doc)
# Check for Text
elif tag_name in ["p"]:
# "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc)
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
def _str_to_int(
self, s: Optional[str], default: Optional[int] = 0
) -> Optional[int]:
if s is None:
return None
try:
@ -207,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError:
return default
def split_text_and_number(self, input_string: str) -> list[str]:
def _split_text_and_number(self, input_string: str) -> list[str]:
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
if match:
parts = list(filter(None, match.groups()))
@ -215,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
return [input_string]
def get_numId_and_ilvl(
def _get_numId_and_ilvl(
self, paragraph: Paragraph
) -> tuple[Optional[int], Optional[int]]:
# Access the XML element of the paragraph
@ -230,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
return self._str_to_int(numId, None), self._str_to_int(ilvl, None)
return None, None # If the paragraph is not part of a list
def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self.split_text_and_number(style_label)
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
parts = self._split_text_and_number(style_label)
if len(parts) == 2:
parts.sort()
@ -243,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label_level: Optional[int] = 0
if parts[0].strip().lower() == "heading":
label_str = "Heading"
label_level = self.str_to_int(parts[1], None)
label_level = self._str_to_int(parts[1], None)
if parts[1].strip().lower() == "heading":
label_str = "Heading"
label_level = self.str_to_int(parts[0], None)
label_level = self._str_to_int(parts[0], None)
return label_str, label_level
return style_label, None
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
if paragraph.style is None:
return "Normal", None
@ -264,16 +270,82 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if ":" in label:
parts = label.split(":")
if len(parts) == 2:
return parts[0], self.str_to_int(parts[1], None)
return parts[0], self._str_to_int(parts[1], None)
if "heading" in label.lower():
return self.get_heading_and_level(label)
return self._get_heading_and_level(label)
if "heading" in name.lower():
return self.get_heading_and_level(name)
return self._get_heading_and_level(name)
return label, None
def handle_equations_in_text(self, element, text):
@classmethod
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
has_any_formatting = run.bold or run.italic or run.underline
return (
Formatting(
bold=run.bold or False,
italic=run.italic or False,
underline=run.underline or False,
)
if has_any_formatting
else None
)
def _get_paragraph_elements(self, paragraph: Paragraph):
"""
Extract paragraph elements along with their formatting and hyperlink
"""
# for now retain empty paragraphs for backwards compatibility:
if paragraph.text.strip() == "":
return [("", None, None)]
paragraph_elements: list[
tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]]
] = []
group_text = ""
previous_format = None
# Iterate over the runs of the paragraph and group them by format
for c in paragraph.iter_inner_content():
if isinstance(c, Hyperlink):
text = c.text
hyperlink = Path(c.address)
format = self._get_format_from_run(c.runs[0])
elif isinstance(c, Run):
text = c.text
hyperlink = None
format = self._get_format_from_run(c)
else:
continue
if (len(text.strip()) and format != previous_format) or (
hyperlink is not None
):
# If the style changes for a non empty text, add the previous group
if len(group_text.strip()) > 0:
paragraph_elements.append(
(group_text.strip(), previous_format, None)
)
group_text = ""
# If there is a hyperlink, add it immediately
if hyperlink is not None:
paragraph_elements.append((text.strip(), format, hyperlink))
text = ""
else:
previous_format = format
group_text += text
# Format the last group
if len(group_text.strip()) > 0:
paragraph_elements.append((group_text.strip(), format, None))
return paragraph_elements
def _handle_equations_in_text(self, element, text):
only_texts = []
only_equations = []
texts_and_equations = []
@ -319,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return output_text, only_equations
def handle_text_elements(
def _create_or_reuse_parent(
self,
*,
doc: DoclingDocument,
prev_parent: Optional[NodeItem],
paragraph_elements: list,
) -> Optional[NodeItem]:
return (
doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
if len(paragraph_elements) > 1
else prev_parent
)
def _handle_text_elements(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@ -328,10 +413,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = Paragraph(element, docx_obj)
raw_text = paragraph.text
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
@ -339,8 +425,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Identify wether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)
p_style_id, p_level = self._get_label_and_level(paragraph)
numid, ilevel = self._get_numId_and_ilvl(paragraph)
if numid == 0:
numid = None
@ -351,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and ilevel is not None
and p_style_id not in ["Title", "Heading"]
):
self.add_listitem(
doc,
numid,
ilevel,
text,
is_numbered,
self._add_list_item(
doc=doc,
numid=numid,
ilevel=ilevel,
elements=paragraph_elements,
is_numbered=is_numbered,
)
self.update_history(p_style_id, p_level, numid, ilevel)
self._update_history(p_style_id, p_level, numid, ilevel)
return
elif (
numid is None
and self.prev_numid() is not None
and self._prev_numid() is not None
and p_style_id not in ["Title", "Heading"]
): # Close list
if self.level_at_new_list:
@ -390,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
is_numbered_style = False
self.add_header(doc, p_level, text, is_numbered_style)
self._add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
# Standalone equation
level = self.get_level()
level = self._get_level()
doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
@ -403,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
else:
# Inline equation
level = self.get_level()
level = self._get_level()
inline_equation = doc.add_group(
label=GroupLabel.INLINE, parent=self.parents[level - 1]
)
@ -442,30 +528,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"ListBullet",
"Quote",
]:
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
level = self._get_level()
parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
else:
# Text style names can, and will have, not only default values but user values too
# hence we treat all other labels as pure text
level = self.get_level()
doc.add_text(
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
level = self._get_level()
parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents.get(level - 1),
paragraph_elements=paragraph_elements,
)
for text, format, hyperlink in paragraph_elements:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.update_history(p_style_id, p_level, numid, ilevel)
self._update_history(p_style_id, p_level, numid, ilevel)
return
def add_header(
def _add_header(
self,
doc: DoclingDocument,
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> None:
level = self.get_level()
level = self._get_level()
if isinstance(curr_level, int):
if curr_level > level:
# add invisible group
@ -521,19 +627,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def add_listitem(
def _add_list_item(
self,
*,
doc: DoclingDocument,
numid: int,
ilevel: int,
text: str,
elements: list,
is_numbered: bool = False,
) -> None:
enum_marker = ""
level = self.get_level()
prev_indent = self.prev_indent()
if self.prev_numid() is None: # Open new list
level = self._get_level()
prev_indent = self._prev_indent()
if self._prev_numid() is None: # Open new list
self.level_at_new_list = level
self.parents[level] = doc.add_group(
@ -545,15 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level],
text=text,
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self.prev_numid() == numid
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and prev_indent < ilevel
@ -581,15 +696,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
)
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self.prev_numid() == numid
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
@ -603,29 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[self.level_at_new_list + ilevel],
text=text,
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0
elif self.prev_numid() == numid or prev_indent == ilevel:
elif self._prev_numid() == numid or prev_indent == ilevel:
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=self.parents[level - 1],
text=text,
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return
def handle_tables(
def _handle_tables(
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@ -640,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
cell_element = table.rows[0].cells[0]
# In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body
self.walk_linear(cell_element._element, docx_obj, doc)
self._walk_linear(cell_element._element, docx_obj, doc)
return
data = TableData(num_rows=num_rows, num_cols=num_cols)
@ -685,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
data.table_cells.append(table_cell)
col_idx += cell.grid_span
level = self.get_level()
level = self._get_level()
doc.add_table(data=data, parent=self.parents[level - 1])
return
def handle_pictures(
def _handle_pictures(
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
) -> None:
def get_docx_image(drawing_blip):
@ -702,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image_data = image_part.blob # Get the binary image data
return image_data
level = self.get_level()
level = self._get_level()
# Open the BytesIO object with PIL to create an Image
try:
image_data = get_docx_image(drawing_blip)

Binary file not shown.

View File

@ -0,0 +1,30 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: italic
item-2 at level 1: paragraph: bold
item-3 at level 1: paragraph: underline
item-4 at level 1: paragraph: hyperlink
item-5 at level 1: paragraph: italic and bold hyperlink
item-6 at level 1: inline: group group
item-7 at level 2: paragraph: Normal
item-8 at level 2: paragraph: italic
item-9 at level 2: paragraph: bold
item-10 at level 2: paragraph: underline
item-11 at level 2: paragraph: and
item-12 at level 2: paragraph: hyperlink
item-13 at level 2: paragraph: on the same line
item-14 at level 1: paragraph:
item-15 at level 1: list: group list
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list_item: Some
item-21 at level 3: list_item: italic
item-22 at level 3: list_item: bold
item-23 at level 3: list_item: underline
item-24 at level 2: list: group list
item-25 at level 3: inline: group group
item-26 at level 4: list_item: Nested
item-27 at level 4: list_item: italic
item-28 at level 4: list_item: bold
item-29 at level 1: paragraph:

View File

@ -0,0 +1,577 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 16380079676357958448,
"filename": "unit_test_formatting.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/12"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/23"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
},
{
"$ref": "#/texts/10"
},
{
"$ref": "#/texts/11"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/13"
},
{
"$ref": "#/texts/14"
},
{
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/17"
},
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/groups/1"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic and bold hyperlink",
"text": "italic and bold hyperlink",
"formatting": {
"bold": true,
"italic": true,
"underline": false,
"strikethrough": false
},
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Normal",
"text": "Normal"
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "and",
"text": "and"
},
{
"self_ref": "#/texts/10",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "hyperlink",
"text": "hyperlink",
"hyperlink": "https:/github.com/DS4SD/docling"
},
{
"self_ref": "#/texts/11",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "on the same line",
"text": "on the same line"
},
{
"self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/13",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Italic bullet 1",
"text": "Italic bullet 1",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/14",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Bold bullet 2",
"text": "Bold bullet 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/15",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Underline bullet 3",
"text": "Underline bullet 3",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some",
"text": "Some",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/17",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,17 @@
*italic*
**bold**
underline
[hyperlink](https:/github.com/DS4SD/docling)
[***italic and bold hyperlink***](https:/github.com/DS4SD/docling)
Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line
- *Italic bullet 1*
- **Bold bullet 2**
- Underline bullet 3
- Some *italic* **bold** underline
- Nested *italic* **bold**

View File

@ -76,17 +76,19 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md"
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
), "export to indented-text"
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
doc, str(gt_path) + ".json", generate=GENERATE
), "document document"
if docx_path.name == "word_tables.docx":