fix: Missing text in docx (t tag) when embedded in a table (#528)

Fix for missing text in docx (t tag) when embedded in a table Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2024-12-06 12:37:25 +01:00 · 2024-12-06 12:37:25 +01:00 · b730b2d7a0
commit b730b2d7a0
parent c830b92b2e
1 changed files with 20 additions and 13 deletions
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -133,7 +133,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
        for element in body:
            tag_name = etree.QName(element).localname
            # Check for Inline Images (blip elements)
            namespaces = {
                "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@ -153,6 +152,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self.handle_pictures(element, docx_obj, drawing_blip, doc)
            # Check for Text
            elif tag_name in ["p"]:
                # "tcPr", "sectPr"
                self.handle_text_elements(element, docx_obj, doc)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@ -219,7 +219,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        if paragraph.text is None:
            return
        text = paragraph.text.strip()
        # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
        # Common styles for bullet and numbered lists.
        # "List Bullet", "List Number", "List Paragraph"
@ -291,9 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
        level = self.get_level()
        if isinstance(curr_level, int):
            if curr_level > level:
                # add invisible group
                for i in range(level, curr_level):
                    self.parents[i] = doc.add_group(
@ -301,9 +298,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                        label=GroupLabel.SECTION,
                        name=f"header-{i}",
                    )
            elif curr_level < level:
                # remove the tail
                for key, val in self.parents.items():
                    if key >= curr_level:
@ -314,7 +309,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                text=text,
                level=curr_level,
            )
        else:
            self.parents[self.level] = doc.add_heading(
                parent=self.parents[self.level - 1],
@ -346,7 +340,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
            )
-            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            # Set marker and enumerated arguments if this is an enumeration element.
            self.listIter += 1
            if is_numbered:
                enum_marker = str(self.listIter) + "."
@ -365,8 +359,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self.level_at_new_list + self.prev_indent() + 1,
                self.level_at_new_list + ilevel + 1,
            ):
-                # TODO: determine if this is an unordered list or an ordered list.
+                # Determine if this is an unordered list or an ordered list.
-                #  Set GroupLabel.ORDERED_LIST when it fits.
+                # Set GroupLabel.ORDERED_LIST when it fits.
                self.listIter = 0
                if is_numbered:
                    self.parents[i] = doc.add_group(
@ -467,6 +461,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                row_span = get_rowspan(cell)
                col_span = get_colspan(cell)
                cell_text = cell.text
                # In case cell doesn't return text via docx library:
                if len(cell_text) == 0:
                    cell_xml = cell._element
                    texts = [""]
                    for elem in cell_xml.iter():
                        if elem.tag.endswith("t"):  # <w:t> tags that contain text
                            if elem.text:
                                texts.append(elem.text)
                    # Join the collected text
                    cell_text = " ".join(texts).strip()
                # Find the next available column in the grid
                while table_grid[row_idx][col_idx] is not None:
                    col_idx += 1
@ -477,15 +484,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                        table_grid[row_idx + i][col_idx + j] = ""
                cell = TableCell(
-                    text=cell.text,
+                    text=cell_text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
-                    col_header=False,  # col_header,
+                    col_header=False,
-                    row_header=False,  # ((not col_header) and html_cell.name=='th')
+                    row_header=False,
                )
                data.table_cells.append(cell)