fix(docx): ensure list items have a list parent (#1827)

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-06-20 14:47:25 +02:00 committed by GitHub
parent 1350a8d3e5
commit d26dac61a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 200 additions and 173 deletions

View File

@ -14,7 +14,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import Formatting
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
) from e
@override
@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self._handle_text_elements(element, docx_obj, doc)
else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
return doc
def _str_to_int(
@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
all_paragraphs = []
# Sort paragraphs within each container, then process containers
for container_id, paragraphs in container_paragraphs.items():
for paragraphs in container_paragraphs.values():
# Sort by vertical position within each container
sorted_container_paragraphs = sorted(
paragraphs,
@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc: DoclingDocument,
) -> None:
paragraph = Paragraph(element, docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
)
if text is None:
return
paragraph_elements = self._get_paragraph_elements(paragraph)
text = text.strip()
# Common styles for bullet and numbered lists.
@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
return
def _add_formatted_list_item(
self,
doc: DoclingDocument,
elements: list,
marker: str,
enumerated: bool,
level: int,
) -> None:
# This should not happen by construction
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
return
if len(elements) == 1:
text, format, hyperlink = elements[0]
doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text=text,
formatting=format,
hyperlink=hyperlink,
)
else:
new_item = doc.add_list_item(
marker=marker,
enumerated=enumerated,
parent=self.parents[level],
text="",
)
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
for text, format, hyperlink in elements:
doc.add_text(
label=DocItemLabel.TEXT,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
def _add_list_item(
self,
*,
@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
elements: list,
is_numbered: bool = False,
) -> None:
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return None
enum_marker = ""
level = self._get_level()
@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
elif (
self._prev_numid() == numid
and self.level_at_new_list is not None
and prev_indent is not None
and ilevel < prev_indent
): # Close list
for k, v in self.parents.items():
for k in self.parents:
if k > self.level_at_new_list + ilevel:
self.parents[k] = None
@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[self.level_at_new_list + ilevel],
paragraph_elements=elements,
self._add_formatted_list_item(
doc,
elements,
enum_marker,
is_numbered,
self.level_at_new_list + ilevel,
)
for text, format, hyperlink in elements:
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
self.listIter = 0
elif self._prev_numid() == numid or prev_indent == ilevel:
@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if is_numbered:
enum_marker = str(self.listIter) + "."
is_numbered = True
new_parent = self._create_or_reuse_parent(
doc=doc,
prev_parent=self.parents[level - 1],
paragraph_elements=elements,
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
)
for text, format, hyperlink in elements:
# Add the list item to the parent group
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_parent,
text=text,
formatting=format,
hyperlink=hyperlink,
)
return
def _handle_tables(

View File

@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
item-19 at level 2: inline: group group
item-20 at level 3: list_item: Some
item-21 at level 3: list_item: italic
item-22 at level 3: list_item: bold
item-23 at level 3: list_item: underline
item-24 at level 2: list: group list
item-25 at level 3: inline: group group
item-26 at level 4: list_item: Nested
item-27 at level 4: list_item: italic
item-28 at level 4: list_item: bold
item-29 at level 1: paragraph:
item-19 at level 2: list_item:
item-20 at level 3: inline: group group
item-21 at level 4: text: Some
item-22 at level 4: text: italic
item-23 at level 4: text: bold
item-24 at level 4: text: underline
item-25 at level 2: list: group list
item-26 at level 3: list_item:
item-27 at level 4: inline: group group
item-28 at level 5: text: Nested
item-29 at level 5: text: italic
item-30 at level 5: text: bold
item-31 at level 1: paragraph:

View File

@ -42,7 +42,7 @@
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/23"
"$ref": "#/texts/25"
}
],
"content_layer": "body",
@ -98,7 +98,7 @@
"$ref": "#/texts/15"
},
{
"$ref": "#/groups/2"
"$ref": "#/texts/16"
},
{
"$ref": "#/groups/3"
@ -111,12 +111,9 @@
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/groups/1"
"$ref": "#/texts/16"
},
"children": [
{
"$ref": "#/texts/16"
},
{
"$ref": "#/texts/17"
},
@ -125,6 +122,9 @@
},
{
"$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
}
],
"content_layer": "body",
@ -138,7 +138,7 @@
},
"children": [
{
"$ref": "#/groups/4"
"$ref": "#/texts/21"
}
],
"content_layer": "body",
@ -148,17 +148,17 @@
{
"self_ref": "#/groups/4",
"parent": {
"$ref": "#/groups/3"
"$ref": "#/texts/21"
},
"children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{
"$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
}
],
"content_layer": "body",
@ -461,20 +461,18 @@
{
"self_ref": "#/texts/16",
"parent": {
"$ref": "#/groups/2"
"$ref": "#/groups/1"
},
"children": [],
"children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Some",
"text": "Some",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
@ -485,18 +483,16 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"orig": "Some",
"text": "Some",
"formatting": {
"bold": false,
"italic": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
}
},
{
"self_ref": "#/texts/18",
@ -505,67 +501,7 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
@ -574,7 +510,59 @@
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
@ -585,7 +573,43 @@
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "text",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
@ -594,12 +618,10 @@
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
}
},
{
"self_ref": "#/texts/23",
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/body"
},

View File

@ -97,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
f"export to markdown failed on {docx_path}"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
f"export to indented-text failed on {docx_path}"
)
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
"document document"
f"DoclingDocument verification failed on {docx_path}"
)
if docx_path.name == "word_tables.docx":
@ -117,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
pred_text=pred_html,
gtfile=str(gt_path) + ".html",
generate=GENERATE,
), "export to html"
), f"export to html failed on {docx_path}"
flaky_path = Path("tests/data/docx/textbox.docx")