fix(docx): ensure list items have a list parent (#1827)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
1350a8d3e5
commit
d26dac61a8
@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import Formatting
|
||||
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||
from docx import Document
|
||||
from docx.document import Document as DocxDocument
|
||||
from docx.oxml.table import CT_Tc
|
||||
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
|
||||
@override
|
||||
@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
|
||||
return doc
|
||||
|
||||
def _str_to_int(
|
||||
@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
all_paragraphs = []
|
||||
|
||||
# Sort paragraphs within each container, then process containers
|
||||
for container_id, paragraphs in container_paragraphs.items():
|
||||
for paragraphs in container_paragraphs.values():
|
||||
# Sort by vertical position within each container
|
||||
sorted_container_paragraphs = sorted(
|
||||
paragraphs,
|
||||
@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc: DoclingDocument,
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=element, text=paragraph.text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _add_formatted_list_item(
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
elements: list,
|
||||
marker: str,
|
||||
enumerated: bool,
|
||||
level: int,
|
||||
) -> None:
|
||||
# This should not happen by construction
|
||||
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||
return
|
||||
if len(elements) == 1:
|
||||
text, format, hyperlink = elements[0]
|
||||
doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
else:
|
||||
new_item = doc.add_list_item(
|
||||
marker=marker,
|
||||
enumerated=enumerated,
|
||||
parent=self.parents[level],
|
||||
text="",
|
||||
)
|
||||
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.TEXT,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
def _add_list_item(
|
||||
self,
|
||||
*,
|
||||
@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elements: list,
|
||||
is_numbered: bool = False,
|
||||
) -> None:
|
||||
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||
if not elements:
|
||||
return None
|
||||
enum_marker = ""
|
||||
|
||||
level = self._get_level()
|
||||
@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
elif (
|
||||
self._prev_numid() == numid
|
||||
and self.level_at_new_list is not None
|
||||
and prev_indent is not None
|
||||
and ilevel < prev_indent
|
||||
): # Close list
|
||||
for k, v in self.parents.items():
|
||||
for k in self.parents:
|
||||
if k > self.level_at_new_list + ilevel:
|
||||
self.parents[k] = None
|
||||
|
||||
@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc,
|
||||
elements,
|
||||
enum_marker,
|
||||
is_numbered,
|
||||
self.level_at_new_list + ilevel,
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
self.listIter = 0
|
||||
|
||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||
@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
is_numbered = True
|
||||
new_parent = self._create_or_reuse_parent(
|
||||
doc=doc,
|
||||
prev_parent=self.parents[level - 1],
|
||||
paragraph_elements=elements,
|
||||
self._add_formatted_list_item(
|
||||
doc, elements, enum_marker, is_numbered, level - 1
|
||||
)
|
||||
for text, format, hyperlink in elements:
|
||||
# Add the list item to the parent group
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_parent,
|
||||
text=text,
|
||||
formatting=format,
|
||||
hyperlink=hyperlink,
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
def _handle_tables(
|
||||
|
@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-16 at level 2: list_item: Italic bullet 1
|
||||
item-17 at level 2: list_item: Bold bullet 2
|
||||
item-18 at level 2: list_item: Underline bullet 3
|
||||
item-19 at level 2: inline: group group
|
||||
item-20 at level 3: list_item: Some
|
||||
item-21 at level 3: list_item: italic
|
||||
item-22 at level 3: list_item: bold
|
||||
item-23 at level 3: list_item: underline
|
||||
item-24 at level 2: list: group list
|
||||
item-25 at level 3: inline: group group
|
||||
item-26 at level 4: list_item: Nested
|
||||
item-27 at level 4: list_item: italic
|
||||
item-28 at level 4: list_item: bold
|
||||
item-29 at level 1: paragraph:
|
||||
item-19 at level 2: list_item:
|
||||
item-20 at level 3: inline: group group
|
||||
item-21 at level 4: text: Some
|
||||
item-22 at level 4: text: italic
|
||||
item-23 at level 4: text: bold
|
||||
item-24 at level 4: text: underline
|
||||
item-25 at level 2: list: group list
|
||||
item-26 at level 3: list_item:
|
||||
item-27 at level 4: inline: group group
|
||||
item-28 at level 5: text: Nested
|
||||
item-29 at level 5: text: italic
|
||||
item-30 at level 5: text: bold
|
||||
item-31 at level 1: paragraph:
|
@ -42,7 +42,7 @@
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
"$ref": "#/texts/25"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -98,7 +98,7 @@
|
||||
"$ref": "#/texts/15"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/groups/3"
|
||||
@ -111,12 +111,9 @@
|
||||
{
|
||||
"self_ref": "#/groups/2",
|
||||
"parent": {
|
||||
"$ref": "#/groups/1"
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/16"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/17"
|
||||
},
|
||||
@ -125,6 +122,9 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/19"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -138,7 +138,7 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
"$ref": "#/texts/21"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -148,17 +148,17 @@
|
||||
{
|
||||
"self_ref": "#/groups/4",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/20"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/21"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/22"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/23"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/24"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -461,20 +461,18 @@
|
||||
{
|
||||
"self_ref": "#/texts/16",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
"$ref": "#/groups/1"
|
||||
},
|
||||
"children": [],
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/2"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@ -485,18 +483,16 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"orig": "Some",
|
||||
"text": "Some",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/18",
|
||||
@ -505,67 +501,7 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
@ -574,7 +510,59 @@
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/19",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
"formatting": {
|
||||
"bold": true,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/20",
|
||||
"parent": {
|
||||
"$ref": "#/groups/2"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "underline",
|
||||
"text": "underline",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": true,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/21",
|
||||
"parent": {
|
||||
"$ref": "#/groups/3"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/4"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
@ -585,7 +573,43 @@
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "Nested",
|
||||
"text": "Nested",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "italic",
|
||||
"text": "italic",
|
||||
"formatting": {
|
||||
"bold": false,
|
||||
"italic": true,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/24",
|
||||
"parent": {
|
||||
"$ref": "#/groups/4"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [],
|
||||
"orig": "bold",
|
||||
"text": "bold",
|
||||
@ -594,12 +618,10 @@
|
||||
"italic": false,
|
||||
"underline": false,
|
||||
"strikethrough": false
|
||||
},
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/23",
|
||||
"self_ref": "#/texts/25",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
|
@ -97,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
|
||||
pred_md: str = doc.export_to_markdown()
|
||||
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
||||
"export to md"
|
||||
f"export to markdown failed on {docx_path}"
|
||||
)
|
||||
|
||||
pred_itxt: str = doc._export_to_indented_text(
|
||||
max_text_len=70, explicit_tables=False
|
||||
)
|
||||
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
||||
"export to indented-text"
|
||||
f"export to indented-text failed on {docx_path}"
|
||||
)
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
||||
"document document"
|
||||
f"DoclingDocument verification failed on {docx_path}"
|
||||
)
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
@ -117,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||
pred_text=pred_html,
|
||||
gtfile=str(gt_path) + ".html",
|
||||
generate=GENERATE,
|
||||
), "export to html"
|
||||
), f"export to html failed on {docx_path}"
|
||||
|
||||
|
||||
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||
|
Loading…
Reference in New Issue
Block a user