fix(docx): ensure list items have a list parent (#1827)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
1350a8d3e5
commit
d26dac61a8
@ -14,7 +14,7 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import Formatting
|
from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.document import Document as DocxDocument
|
from docx.document import Document as DocxDocument
|
||||||
from docx.oxml.table import CT_Tc
|
from docx.oxml.table import CT_Tc
|
||||||
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
@override
|
@override
|
||||||
@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self._handle_text_elements(element, docx_obj, doc)
|
self._handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def _str_to_int(
|
def _str_to_int(
|
||||||
@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
all_paragraphs = []
|
all_paragraphs = []
|
||||||
|
|
||||||
# Sort paragraphs within each container, then process containers
|
# Sort paragraphs within each container, then process containers
|
||||||
for container_id, paragraphs in container_paragraphs.items():
|
for paragraphs in container_paragraphs.values():
|
||||||
# Sort by vertical position within each container
|
# Sort by vertical position within each container
|
||||||
sorted_container_paragraphs = sorted(
|
sorted_container_paragraphs = sorted(
|
||||||
paragraphs,
|
paragraphs,
|
||||||
@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
) -> None:
|
) -> None:
|
||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||||
text, equations = self._handle_equations_in_text(
|
text, equations = self._handle_equations_in_text(
|
||||||
element=element, text=paragraph.text
|
element=element, text=paragraph.text
|
||||||
)
|
)
|
||||||
|
|
||||||
if text is None:
|
if text is None:
|
||||||
return
|
return
|
||||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _add_formatted_list_item(
|
||||||
|
self,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
elements: list,
|
||||||
|
marker: str,
|
||||||
|
enumerated: bool,
|
||||||
|
level: int,
|
||||||
|
) -> None:
|
||||||
|
# This should not happen by construction
|
||||||
|
if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
|
||||||
|
return
|
||||||
|
if len(elements) == 1:
|
||||||
|
text, format, hyperlink = elements[0]
|
||||||
|
doc.add_list_item(
|
||||||
|
marker=marker,
|
||||||
|
enumerated=enumerated,
|
||||||
|
parent=self.parents[level],
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
new_item = doc.add_list_item(
|
||||||
|
marker=marker,
|
||||||
|
enumerated=enumerated,
|
||||||
|
parent=self.parents[level],
|
||||||
|
text="",
|
||||||
|
)
|
||||||
|
new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
|
||||||
|
for text, format, hyperlink in elements:
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.TEXT,
|
||||||
|
parent=new_parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
|
)
|
||||||
|
|
||||||
def _add_list_item(
|
def _add_list_item(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elements: list,
|
elements: list,
|
||||||
is_numbered: bool = False,
|
is_numbered: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
|
||||||
|
if not elements:
|
||||||
|
return None
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
|
|
||||||
level = self._get_level()
|
level = self._get_level()
|
||||||
@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc, elements, enum_marker, is_numbered, level
|
||||||
prev_parent=self.parents[level],
|
|
||||||
paragraph_elements=elements,
|
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self._prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
self._add_formatted_list_item(
|
||||||
new_parent = self._create_or_reuse_parent(
|
doc,
|
||||||
doc=doc,
|
elements,
|
||||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
enum_marker,
|
||||||
paragraph_elements=elements,
|
is_numbered,
|
||||||
|
self.level_at_new_list + ilevel,
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
elif (
|
elif (
|
||||||
self._prev_numid() == numid
|
self._prev_numid() == numid
|
||||||
and self.level_at_new_list is not None
|
and self.level_at_new_list is not None
|
||||||
and prev_indent is not None
|
and prev_indent is not None
|
||||||
and ilevel < prev_indent
|
and ilevel < prev_indent
|
||||||
): # Close list
|
): # Close list
|
||||||
for k, v in self.parents.items():
|
for k in self.parents:
|
||||||
if k > self.level_at_new_list + ilevel:
|
if k > self.level_at_new_list + ilevel:
|
||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
|
|
||||||
@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc,
|
||||||
prev_parent=self.parents[self.level_at_new_list + ilevel],
|
elements,
|
||||||
paragraph_elements=elements,
|
enum_marker,
|
||||||
|
is_numbered,
|
||||||
|
self.level_at_new_list + ilevel,
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
elif self._prev_numid() == numid or prev_indent == ilevel:
|
elif self._prev_numid() == numid or prev_indent == ilevel:
|
||||||
@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
new_parent = self._create_or_reuse_parent(
|
self._add_formatted_list_item(
|
||||||
doc=doc,
|
doc, elements, enum_marker, is_numbered, level - 1
|
||||||
prev_parent=self.parents[level - 1],
|
|
||||||
paragraph_elements=elements,
|
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
|
||||||
# Add the list item to the parent group
|
|
||||||
doc.add_list_item(
|
|
||||||
marker=enum_marker,
|
|
||||||
enumerated=is_numbered,
|
|
||||||
parent=new_parent,
|
|
||||||
text=text,
|
|
||||||
formatting=format,
|
|
||||||
hyperlink=hyperlink,
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def _handle_tables(
|
def _handle_tables(
|
||||||
|
@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-16 at level 2: list_item: Italic bullet 1
|
item-16 at level 2: list_item: Italic bullet 1
|
||||||
item-17 at level 2: list_item: Bold bullet 2
|
item-17 at level 2: list_item: Bold bullet 2
|
||||||
item-18 at level 2: list_item: Underline bullet 3
|
item-18 at level 2: list_item: Underline bullet 3
|
||||||
item-19 at level 2: inline: group group
|
item-19 at level 2: list_item:
|
||||||
item-20 at level 3: list_item: Some
|
item-20 at level 3: inline: group group
|
||||||
item-21 at level 3: list_item: italic
|
item-21 at level 4: text: Some
|
||||||
item-22 at level 3: list_item: bold
|
item-22 at level 4: text: italic
|
||||||
item-23 at level 3: list_item: underline
|
item-23 at level 4: text: bold
|
||||||
item-24 at level 2: list: group list
|
item-24 at level 4: text: underline
|
||||||
item-25 at level 3: inline: group group
|
item-25 at level 2: list: group list
|
||||||
item-26 at level 4: list_item: Nested
|
item-26 at level 3: list_item:
|
||||||
item-27 at level 4: list_item: italic
|
item-27 at level 4: inline: group group
|
||||||
item-28 at level 4: list_item: bold
|
item-28 at level 5: text: Nested
|
||||||
item-29 at level 1: paragraph:
|
item-29 at level 5: text: italic
|
||||||
|
item-30 at level 5: text: bold
|
||||||
|
item-31 at level 1: paragraph:
|
@ -42,7 +42,7 @@
|
|||||||
"$ref": "#/groups/1"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/23"
|
"$ref": "#/texts/25"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -98,7 +98,7 @@
|
|||||||
"$ref": "#/texts/15"
|
"$ref": "#/texts/15"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/2"
|
"$ref": "#/texts/16"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/3"
|
"$ref": "#/groups/3"
|
||||||
@ -111,12 +111,9 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/groups/2",
|
"self_ref": "#/groups/2",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/1"
|
"$ref": "#/texts/16"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
|
||||||
"$ref": "#/texts/16"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/17"
|
"$ref": "#/texts/17"
|
||||||
},
|
},
|
||||||
@ -125,6 +122,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/19"
|
"$ref": "#/texts/19"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/20"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -138,7 +138,7 @@
|
|||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/4"
|
"$ref": "#/texts/21"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -148,17 +148,17 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/groups/4",
|
"self_ref": "#/groups/4",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/3"
|
"$ref": "#/texts/21"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
|
||||||
"$ref": "#/texts/20"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/21"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/22"
|
"$ref": "#/texts/22"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/23"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/24"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -461,20 +461,18 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/16",
|
"self_ref": "#/texts/16",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/2"
|
"$ref": "#/groups/1"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
}
|
||||||
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "Some",
|
"orig": "",
|
||||||
"text": "Some",
|
"text": "",
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -485,18 +483,16 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "text",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "italic",
|
"orig": "Some",
|
||||||
"text": "italic",
|
"text": "Some",
|
||||||
"formatting": {
|
"formatting": {
|
||||||
"bold": false,
|
"bold": false,
|
||||||
"italic": true,
|
"italic": false,
|
||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false
|
"strikethrough": false
|
||||||
},
|
}
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/18",
|
"self_ref": "#/texts/18",
|
||||||
@ -505,67 +501,7 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "text",
|
||||||
"prov": [],
|
|
||||||
"orig": "bold",
|
|
||||||
"text": "bold",
|
|
||||||
"formatting": {
|
|
||||||
"bold": true,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/19",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/2"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "underline",
|
|
||||||
"text": "underline",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": false,
|
|
||||||
"underline": true,
|
|
||||||
"strikethrough": false
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/20",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/4"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
|
||||||
"orig": "Nested",
|
|
||||||
"text": "Nested",
|
|
||||||
"formatting": {
|
|
||||||
"bold": false,
|
|
||||||
"italic": false,
|
|
||||||
"underline": false,
|
|
||||||
"strikethrough": false
|
|
||||||
},
|
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/texts/21",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/groups/4"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"label": "list_item",
|
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "italic",
|
"orig": "italic",
|
||||||
"text": "italic",
|
"text": "italic",
|
||||||
@ -574,7 +510,59 @@
|
|||||||
"italic": true,
|
"italic": true,
|
||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/19",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
},
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "bold",
|
||||||
|
"text": "bold",
|
||||||
|
"formatting": {
|
||||||
|
"bold": true,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/20",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/2"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "underline",
|
||||||
|
"text": "underline",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": true,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/21",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/3"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "list_item",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "",
|
||||||
|
"text": "",
|
||||||
"enumerated": false,
|
"enumerated": false,
|
||||||
"marker": "-"
|
"marker": "-"
|
||||||
},
|
},
|
||||||
@ -585,7 +573,43 @@
|
|||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "list_item",
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Nested",
|
||||||
|
"text": "Nested",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": false,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/23",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "italic",
|
||||||
|
"text": "italic",
|
||||||
|
"formatting": {
|
||||||
|
"bold": false,
|
||||||
|
"italic": true,
|
||||||
|
"underline": false,
|
||||||
|
"strikethrough": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/24",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "bold",
|
"orig": "bold",
|
||||||
"text": "bold",
|
"text": "bold",
|
||||||
@ -594,12 +618,10 @@
|
|||||||
"italic": false,
|
"italic": false,
|
||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false
|
"strikethrough": false
|
||||||
},
|
}
|
||||||
"enumerated": false,
|
|
||||||
"marker": "-"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/23",
|
"self_ref": "#/texts/25",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
|
@ -97,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
|||||||
|
|
||||||
pred_md: str = doc.export_to_markdown()
|
pred_md: str = doc.export_to_markdown()
|
||||||
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
|
||||||
"export to md"
|
f"export to markdown failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
pred_itxt: str = doc._export_to_indented_text(
|
pred_itxt: str = doc._export_to_indented_text(
|
||||||
max_text_len=70, explicit_tables=False
|
max_text_len=70, explicit_tables=False
|
||||||
)
|
)
|
||||||
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
|
||||||
"export to indented-text"
|
f"export to indented-text failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
|
||||||
"document document"
|
f"DoclingDocument verification failed on {docx_path}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if docx_path.name == "word_tables.docx":
|
if docx_path.name == "word_tables.docx":
|
||||||
@ -117,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
|||||||
pred_text=pred_html,
|
pred_text=pred_html,
|
||||||
gtfile=str(gt_path) + ".html",
|
gtfile=str(gt_path) + ".html",
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
), "export to html"
|
), f"export to html failed on {docx_path}"
|
||||||
|
|
||||||
|
|
||||||
flaky_path = Path("tests/data/docx/textbox.docx")
|
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||||
|
Loading…
Reference in New Issue
Block a user