From d26dac61a86b0af5b16686f78956ba047bcbddba Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:47:25 +0200 Subject: [PATCH] fix(docx): ensure list items have a list parent (#1827) Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msword_backend.py | 125 +++++----- .../docling_v2/unit_test_formatting.docx.itxt | 24 +- .../docling_v2/unit_test_formatting.docx.json | 216 ++++++++++-------- tests/test_backend_msword.py | 8 +- 4 files changed, 200 insertions(+), 173 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index ec071ef..8386082 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -14,7 +14,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsWordDocumentBackend could not load document with hash {self.document_hash}" ) from e @override @@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") + return doc def _str_to_int( @@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): all_paragraphs = [] # Sort paragraphs within each container, then process containers - for container_id, paragraphs in container_paragraphs.items(): + for paragraphs in container_paragraphs.values(): # Sort by vertical position within each container sorted_container_paragraphs = sorted( paragraphs, @@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, ) -> None: paragraph = Paragraph(element, docx_obj) - + paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text ) if text is None: return - paragraph_elements = self._get_paragraph_elements(paragraph) text = text.strip() # Common styles for bullet and numbered lists. @@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return + def _add_formatted_list_item( + self, + doc: DoclingDocument, + elements: list, + marker: str, + enumerated: bool, + level: int, + ) -> None: + # This should not happen by construction + if not isinstance(self.parents[level], (OrderedList, UnorderedList)): + return + if len(elements) == 1: + text, format, hyperlink = elements[0] + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) + else: + new_item = doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text="", + ) + new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + for text, format, hyperlink in elements: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) + def _add_list_item( self, *, @@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elements: list, is_numbered: bool = False, ) -> None: + # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed. + if not elements: + return None enum_marker = "" level = self._get_level() @@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) - elif ( self._prev_numid() == numid and self.level_at_new_list is not None @@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) elif ( self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent ): # Close list - for k, v in self.parents.items(): + for k in self.parents: if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) self.listIter = 0 elif self._prev_numid() == numid or prev_indent == ilevel: @@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level - 1], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level - 1 ) - for text, format, hyperlink in elements: - # Add the list item to the parent group - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + return def _handle_tables( diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index 2860c30..fccb44c 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_ item-16 at level 2: list_item: Italic bullet 1 item-17 at level 2: list_item: Bold bullet 2 item-18 at level 2: list_item: Underline bullet 3 - item-19 at level 2: inline: group group - item-20 at level 3: list_item: Some - item-21 at level 3: list_item: italic - item-22 at level 3: list_item: bold - item-23 at level 3: list_item: underline - item-24 at level 2: list: group list - item-25 at level 3: inline: group group - item-26 at level 4: list_item: Nested - item-27 at level 4: list_item: italic - item-28 at level 4: list_item: bold - item-29 at level 1: paragraph: \ No newline at end of file + item-19 at level 2: list_item: + item-20 at level 3: inline: group group + item-21 at level 4: text: Some + item-22 at level 4: text: italic + item-23 at level 4: text: bold + item-24 at level 4: text: underline + item-25 at level 2: list: group list + item-26 at level 3: list_item: + item-27 at level 4: inline: group group + item-28 at level 5: text: Nested + item-29 at level 5: text: italic + item-30 at level 5: text: bold + item-31 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index 8b6ee9d..967aff1 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -42,7 +42,7 @@ "$ref": "#/groups/1" }, { - "$ref": "#/texts/23" + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -98,7 +98,7 @@ "$ref": "#/texts/15" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/16" }, { "$ref": "#/groups/3" @@ -111,12 +111,9 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/groups/1" + "$ref": "#/texts/16" }, "children": [ - { - "$ref": "#/texts/16" - }, { "$ref": "#/texts/17" }, @@ -125,6 +122,9 @@ }, { "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -138,7 +138,7 @@ }, "children": [ { - "$ref": "#/groups/4" + "$ref": "#/texts/21" } ], "content_layer": "body", @@ -148,17 +148,17 @@ { "self_ref": "#/groups/4", "parent": { - "$ref": "#/groups/3" + "$ref": "#/texts/21" }, "children": [ - { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" - }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" } ], "content_layer": "body", @@ -461,20 +461,18 @@ { "self_ref": "#/texts/16", "parent": { - "$ref": "#/groups/2" + "$ref": "#/groups/1" }, - "children": [], + "children": [ + { + "$ref": "#/groups/2" + } + ], "content_layer": "body", "label": "list_item", "prov": [], - "orig": "Some", - "text": "Some", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -485,18 +483,16 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], - "orig": "italic", - "text": "italic", + "orig": "Some", + "text": "Some", "formatting": { "bold": false, - "italic": true, + "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { "self_ref": "#/texts/18", @@ -505,67 +501,7 @@ }, "children": [], "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "bold", - "text": "bold", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "underline", - "text": "underline", - "formatting": { - "bold": false, - "italic": false, - "underline": true, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "Nested", - "text": "Nested", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], "orig": "italic", "text": "italic", @@ -574,7 +510,59 @@ "italic": true, "underline": false, "strikethrough": false + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -585,7 +573,43 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", + "prov": [], + "orig": "Nested", + "text": "Nested", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [], "orig": "bold", "text": "bold", @@ -594,12 +618,10 @@ "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { - "self_ref": "#/texts/23", + "self_ref": "#/texts/25", "parent": { "$ref": "#/body" }, diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 61ddd2a..9da0ea2 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -97,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - "export to md" + f"export to markdown failed on {docx_path}" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - "export to indented-text" + f"export to indented-text failed on {docx_path}" ) assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - "document document" + f"DoclingDocument verification failed on {docx_path}" ) if docx_path.name == "word_tables.docx": @@ -117,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_text=pred_html, gtfile=str(gt_path) + ".html", generate=GENERATE, - ), "export to html" + ), f"export to html failed on {docx_path}" flaky_path = Path("tests/data/docx/textbox.docx")