diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5094c8f..d6b73f7 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -14,15 +14,19 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docling_core.types.doc.document import Formatting from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc from docx.oxml.xmlchemy import BaseOxmlElement from docx.table import Table, _Cell +from docx.text.hyperlink import Hyperlink from docx.text.paragraph import Paragraph +from docx.text.run import Run from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError +from pydantic import AnyUrl from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -118,14 +122,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None - doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) + doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) return doc else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) - def update_history( + def _update_history( self, name: str, level: Optional[int], @@ -138,26 +142,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.history["numids"].append(numid) self.history["indents"].append(ilevel) - def prev_name(self) -> Optional[str]: + def _prev_name(self) -> Optional[str]: return self.history["names"][-1] - def prev_level(self) -> Optional[int]: + def _prev_level(self) -> Optional[int]: return self.history["levels"][-1] - def prev_numid(self) -> Optional[int]: + def _prev_numid(self) -> Optional[int]: return self.history["numids"][-1] - def prev_indent(self) -> Optional[int]: + def _prev_indent(self) -> Optional[int]: return self.history["indents"][-1] - def get_level(self) -> int: + def _get_level(self) -> int: """Return the first None index.""" for k, v in self.parents.items(): if k >= 0 and v == None: return k return 0 - def walk_linear( + def _walk_linear( self, body: BaseOxmlElement, docx_obj: DocxDocument, @@ -177,12 +181,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Check for Tables if element.tag.endswith("tbl"): try: - self.handle_tables(element, docx_obj, doc) + self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") elif drawing_blip: - self.handle_pictures(docx_obj, drawing_blip, doc) + self._handle_pictures(docx_obj, drawing_blip, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -190,16 +194,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Iterate paragraphs, runs, or text inside . paragraphs = sdt_content.findall(".//w:p", namespaces=namespaces) for p in paragraphs: - self.handle_text_elements(p, docx_obj, doc) + self._handle_text_elements(p, docx_obj, doc) # Check for Text elif tag_name in ["p"]: # "tcPr", "sectPr" - self.handle_text_elements(element, docx_obj, doc) + self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc - def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: + def _str_to_int( + self, s: Optional[str], default: Optional[int] = 0 + ) -> Optional[int]: if s is None: return None try: @@ -207,7 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): except ValueError: return default - def split_text_and_number(self, input_string: str) -> list[str]: + def _split_text_and_number(self, input_string: str) -> list[str]: match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) if match: parts = list(filter(None, match.groups())) @@ -215,7 +221,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return [input_string] - def get_numId_and_ilvl( + def _get_numId_and_ilvl( self, paragraph: Paragraph ) -> tuple[Optional[int], Optional[int]]: # Access the XML element of the paragraph @@ -230,12 +236,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None - return self.str_to_int(numId, None), self.str_to_int(ilvl, None) + return self._str_to_int(numId, None), self._str_to_int(ilvl, None) return None, None # If the paragraph is not part of a list - def get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: - parts = self.split_text_and_number(style_label) + def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]: + parts = self._split_text_and_number(style_label) if len(parts) == 2: parts.sort() @@ -243,15 +249,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): label_level: Optional[int] = 0 if parts[0].strip().lower() == "heading": label_str = "Heading" - label_level = self.str_to_int(parts[1], None) + label_level = self._str_to_int(parts[1], None) if parts[1].strip().lower() == "heading": label_str = "Heading" - label_level = self.str_to_int(parts[0], None) + label_level = self._str_to_int(parts[0], None) return label_str, label_level return style_label, None - def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: + def _get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None @@ -264,16 +270,82 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if ":" in label: parts = label.split(":") if len(parts) == 2: - return parts[0], self.str_to_int(parts[1], None) + return parts[0], self._str_to_int(parts[1], None) if "heading" in label.lower(): - return self.get_heading_and_level(label) + return self._get_heading_and_level(label) if "heading" in name.lower(): - return self.get_heading_and_level(name) + return self._get_heading_and_level(name) return label, None - def handle_equations_in_text(self, element, text): + @classmethod + def _get_format_from_run(cls, run: Run) -> Optional[Formatting]: + has_any_formatting = run.bold or run.italic or run.underline + return ( + Formatting( + bold=run.bold or False, + italic=run.italic or False, + underline=run.underline or False, + ) + if has_any_formatting + else None + ) + + def _get_paragraph_elements(self, paragraph: Paragraph): + """ + Extract paragraph elements along with their formatting and hyperlink + """ + + # for now retain empty paragraphs for backwards compatibility: + if paragraph.text.strip() == "": + return [("", None, None)] + + paragraph_elements: list[ + tuple[str, Optional[Formatting], Optional[Union[AnyUrl, Path]]] + ] = [] + group_text = "" + previous_format = None + + # Iterate over the runs of the paragraph and group them by format + for c in paragraph.iter_inner_content(): + if isinstance(c, Hyperlink): + text = c.text + hyperlink = Path(c.address) + format = self._get_format_from_run(c.runs[0]) + elif isinstance(c, Run): + text = c.text + hyperlink = None + format = self._get_format_from_run(c) + else: + continue + + if (len(text.strip()) and format != previous_format) or ( + hyperlink is not None + ): + # If the style changes for a non empty text, add the previous group + if len(group_text.strip()) > 0: + paragraph_elements.append( + (group_text.strip(), previous_format, None) + ) + group_text = "" + + # If there is a hyperlink, add it immediately + if hyperlink is not None: + paragraph_elements.append((text.strip(), format, hyperlink)) + text = "" + else: + previous_format = format + + group_text += text + + # Format the last group + if len(group_text.strip()) > 0: + paragraph_elements.append((group_text.strip(), format, None)) + + return paragraph_elements + + def _handle_equations_in_text(self, element, text): only_texts = [] only_equations = [] texts_and_equations = [] @@ -319,7 +391,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return output_text, only_equations - def handle_text_elements( + def _create_or_reuse_parent( + self, + *, + doc: DoclingDocument, + prev_parent: Optional[NodeItem], + paragraph_elements: list, + ) -> Optional[NodeItem]: + return ( + doc.add_group(label=GroupLabel.INLINE, parent=prev_parent) + if len(paragraph_elements) > 1 + else prev_parent + ) + + def _handle_text_elements( self, element: BaseOxmlElement, docx_obj: DocxDocument, @@ -328,10 +413,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph = Paragraph(element, docx_obj) raw_text = paragraph.text - text, equations = self.handle_equations_in_text(element=element, text=raw_text) + text, equations = self._handle_equations_in_text(element=element, text=raw_text) if text is None: return + paragraph_elements = self._get_paragraph_elements(paragraph) text = text.strip() # Common styles for bullet and numbered lists. @@ -339,8 +425,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): # Identify wether list is a numbered list or not # is_numbered = "List Bullet" not in paragraph.style.name is_numbered = False - p_style_id, p_level = self.get_label_and_level(paragraph) - numid, ilevel = self.get_numId_and_ilvl(paragraph) + p_style_id, p_level = self._get_label_and_level(paragraph) + numid, ilevel = self._get_numId_and_ilvl(paragraph) if numid == 0: numid = None @@ -351,18 +437,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and ilevel is not None and p_style_id not in ["Title", "Heading"] ): - self.add_listitem( - doc, - numid, - ilevel, - text, - is_numbered, + self._add_list_item( + doc=doc, + numid=numid, + ilevel=ilevel, + elements=paragraph_elements, + is_numbered=is_numbered, ) - self.update_history(p_style_id, p_level, numid, ilevel) + self._update_history(p_style_id, p_level, numid, ilevel) return elif ( numid is None - and self.prev_numid() is not None + and self._prev_numid() is not None and p_style_id not in ["Title", "Heading"] ): # Close list if self.level_at_new_list: @@ -390,12 +476,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: is_numbered_style = False - self.add_header(doc, p_level, text, is_numbered_style) + self._add_header(doc, p_level, text, is_numbered_style) elif len(equations) > 0: if (raw_text is None or len(raw_text) == 0) and len(text) > 0: # Standalone equation - level = self.get_level() + level = self._get_level() doc.add_text( label=DocItemLabel.FORMULA, parent=self.parents[level - 1], @@ -403,7 +489,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) else: # Inline equation - level = self.get_level() + level = self._get_level() inline_equation = doc.add_group( label=GroupLabel.INLINE, parent=self.parents[level - 1] ) @@ -442,30 +528,50 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): "ListBullet", "Quote", ]: - level = self.get_level() - doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + level = self._get_level() + parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents.get(level - 1), + paragraph_elements=paragraph_elements, ) + for text, format, hyperlink in paragraph_elements: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) else: # Text style names can, and will have, not only default values but user values too # hence we treat all other labels as pure text - level = self.get_level() - doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + level = self._get_level() + parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents.get(level - 1), + paragraph_elements=paragraph_elements, ) + for text, format, hyperlink in paragraph_elements: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) - self.update_history(p_style_id, p_level, numid, ilevel) + self._update_history(p_style_id, p_level, numid, ilevel) return - def add_header( + def _add_header( self, doc: DoclingDocument, curr_level: Optional[int], text: str, is_numbered_style: bool = False, ) -> None: - level = self.get_level() + level = self._get_level() if isinstance(curr_level, int): if curr_level > level: # add invisible group @@ -521,19 +627,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return - def add_listitem( + def _add_list_item( self, + *, doc: DoclingDocument, numid: int, ilevel: int, - text: str, + elements: list, is_numbered: bool = False, ) -> None: enum_marker = "" - level = self.get_level() - prev_indent = self.prev_indent() - if self.prev_numid() is None: # Open new list + level = self._get_level() + prev_indent = self._prev_indent() + if self._prev_numid() is None: # Open new list self.level_at_new_list = level self.parents[level] = doc.add_group( @@ -545,15 +652,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=self.parents[level], - text=text, + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[level], + paragraph_elements=elements, ) + for text, format, hyperlink in elements: + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) elif ( - self.prev_numid() == numid + self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and prev_indent < ilevel @@ -581,15 +696,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=self.parents[self.level_at_new_list + ilevel], - text=text, - ) + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[self.level_at_new_list + ilevel], + paragraph_elements=elements, + ) + for text, format, hyperlink in elements: + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) elif ( - self.prev_numid() == numid + self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent @@ -603,29 +726,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=self.parents[self.level_at_new_list + ilevel], - text=text, + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[self.level_at_new_list + ilevel], + paragraph_elements=elements, ) + for text, format, hyperlink in elements: + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) self.listIter = 0 - elif self.prev_numid() == numid or prev_indent == ilevel: + elif self._prev_numid() == numid or prev_indent == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=self.parents[level - 1], - text=text, + new_parent = self._create_or_reuse_parent( + doc=doc, + prev_parent=self.parents[level - 1], + paragraph_elements=elements, ) + for text, format, hyperlink in elements: + # Add the list item to the parent group + doc.add_list_item( + marker=enum_marker, + enumerated=is_numbered, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) return - def handle_tables( + def _handle_tables( self, element: BaseOxmlElement, docx_obj: DocxDocument, @@ -640,7 +780,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): cell_element = table.rows[0].cells[0] # In case we have a table of only 1 cell, we consider it furniture # And proceed processing the content of the cell as though it's in the document body - self.walk_linear(cell_element._element, docx_obj, doc) + self._walk_linear(cell_element._element, docx_obj, doc) return data = TableData(num_rows=num_rows, num_cols=num_cols) @@ -685,11 +825,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): data.table_cells.append(table_cell) col_idx += cell.grid_span - level = self.get_level() + level = self._get_level() doc.add_table(data=data, parent=self.parents[level - 1]) return - def handle_pictures( + def _handle_pictures( self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument ) -> None: def get_docx_image(drawing_blip): @@ -702,7 +842,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): image_data = image_part.blob # Get the binary image data return image_data - level = self.get_level() + level = self._get_level() # Open the BytesIO object with PIL to create an Image try: image_data = get_docx_image(drawing_blip) diff --git a/tests/data/docx/unit_test_formatting.docx b/tests/data/docx/unit_test_formatting.docx new file mode 100644 index 0000000..5d08668 Binary files /dev/null and b/tests/data/docx/unit_test_formatting.docx differ diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt new file mode 100644 index 0000000..2860c30 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -0,0 +1,30 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: italic + item-2 at level 1: paragraph: bold + item-3 at level 1: paragraph: underline + item-4 at level 1: paragraph: hyperlink + item-5 at level 1: paragraph: italic and bold hyperlink + item-6 at level 1: inline: group group + item-7 at level 2: paragraph: Normal + item-8 at level 2: paragraph: italic + item-9 at level 2: paragraph: bold + item-10 at level 2: paragraph: underline + item-11 at level 2: paragraph: and + item-12 at level 2: paragraph: hyperlink + item-13 at level 2: paragraph: on the same line + item-14 at level 1: paragraph: + item-15 at level 1: list: group list + item-16 at level 2: list_item: Italic bullet 1 + item-17 at level 2: list_item: Bold bullet 2 + item-18 at level 2: list_item: Underline bullet 3 + item-19 at level 2: inline: group group + item-20 at level 3: list_item: Some + item-21 at level 3: list_item: italic + item-22 at level 3: list_item: bold + item-23 at level 3: list_item: underline + item-24 at level 2: list: group list + item-25 at level 3: inline: group group + item-26 at level 4: list_item: Nested + item-27 at level 4: list_item: italic + item-28 at level 4: list_item: bold + item-29 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json new file mode 100644 index 0000000..9ad75e8 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -0,0 +1,577 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "unit_test_formatting", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 16380079676357958448, + "filename": "unit_test_formatting.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/23" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "hyperlink", + "text": "hyperlink", + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic and bold hyperlink", + "text": "italic and bold hyperlink", + "formatting": { + "bold": true, + "italic": true, + "underline": false, + "strikethrough": false + }, + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Normal", + "text": "Normal" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "and", + "text": "and" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "hyperlink", + "text": "hyperlink", + "hyperlink": "https:/github.com/DS4SD/docling" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "on the same line", + "text": "on the same line" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Italic bullet 1", + "text": "Italic bullet 1", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Bold bullet 2", + "text": "Bold bullet 2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Underline bullet 3", + "text": "Underline bullet 3", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Some", + "text": "Some", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Nested", + "text": "Nested", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md new file mode 100644 index 0000000..918e89e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -0,0 +1,17 @@ +*italic* + +**bold** + +underline + +[hyperlink](https:/github.com/DS4SD/docling) + +[***italic and bold hyperlink***](https:/github.com/DS4SD/docling) + +Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docling) on the same line + +- *Italic bullet 1* +- **Bold bullet 2** +- Underline bullet 3 +- Some *italic* **bold** underline + - Nested *italic* **bold** \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f9843c7..5c43ccf 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -76,17 +76,19 @@ def test_e2e_docx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" + assert verify_export( + pred_md, str(gt_path) + ".md", generate=GENERATE + ), "export to md" pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export( - pred_itxt, str(gt_path) + ".itxt" + pred_itxt, str(gt_path) + ".itxt", generate=GENERATE ), "export to indented-text" assert verify_document( - doc, str(gt_path) + ".json", GENERATE + doc, str(gt_path) + ".json", generate=GENERATE ), "document document" if docx_path.name == "word_tables.docx":