fix: Missing text in docx (t tag) when embedded in a table (#528)

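Fix for missing text in DOCX table cells: when `cell.text` comes back empty, the cell text is now rebuilt by joining the text of the `<w:t>` elements found in the cell's XML.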

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-12-06 12:37:25 +01:00 committed by GitHub
parent c830b92b2e
commit b730b2d7a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -133,7 +133,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (blip elements) # Check for Inline Images (blip elements)
namespaces = { namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@ -153,6 +152,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.handle_pictures(element, docx_obj, drawing_blip, doc) self.handle_pictures(element, docx_obj, drawing_blip, doc)
# Check for Text # Check for Text
elif tag_name in ["p"]: elif tag_name in ["p"]:
# "tcPr", "sectPr"
self.handle_text_elements(element, docx_obj, doc) self.handle_text_elements(element, docx_obj, doc)
else: else:
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}") _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@ -219,7 +219,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if paragraph.text is None: if paragraph.text is None:
return return
text = paragraph.text.strip() text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
# Common styles for bullet and numbered lists. # Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph" # "List Bullet", "List Number", "List Paragraph"
@ -291,9 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
level = self.get_level() level = self.get_level()
if isinstance(curr_level, int): if isinstance(curr_level, int):
if curr_level > level: if curr_level > level:
# add invisible group # add invisible group
for i in range(level, curr_level): for i in range(level, curr_level):
self.parents[i] = doc.add_group( self.parents[i] = doc.add_group(
@ -301,9 +298,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.SECTION, label=GroupLabel.SECTION,
name=f"header-{i}", name=f"header-{i}",
) )
elif curr_level < level: elif curr_level < level:
# remove the tail # remove the tail
for key, val in self.parents.items(): for key, val in self.parents.items():
if key >= curr_level: if key >= curr_level:
@ -314,7 +309,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
text=text, text=text,
level=curr_level, level=curr_level,
) )
else: else:
self.parents[self.level] = doc.add_heading( self.parents[self.level] = doc.add_heading(
parent=self.parents[self.level - 1], parent=self.parents[self.level - 1],
@ -346,7 +340,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
) )
# TODO: Set marker and enumerated arguments if this is an enumeration element. # Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1 self.listIter += 1
if is_numbered: if is_numbered:
enum_marker = str(self.listIter) + "." enum_marker = str(self.listIter) + "."
@ -365,7 +359,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + self.prev_indent() + 1, self.level_at_new_list + self.prev_indent() + 1,
self.level_at_new_list + ilevel + 1, self.level_at_new_list + ilevel + 1,
): ):
# TODO: determine if this is an unordered list or an ordered list. # Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits. # Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0 self.listIter = 0
if is_numbered: if is_numbered:
@ -467,6 +461,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
row_span = get_rowspan(cell) row_span = get_rowspan(cell)
col_span = get_colspan(cell) col_span = get_colspan(cell)
cell_text = cell.text
# In case cell doesn't return text via docx library:
if len(cell_text) == 0:
cell_xml = cell._element
texts = [""]
for elem in cell_xml.iter():
if elem.tag.endswith("t"): # <w:t> tags that contain text
if elem.text:
texts.append(elem.text)
# Join the collected text
cell_text = " ".join(texts).strip()
# Find the next available column in the grid # Find the next available column in the grid
while table_grid[row_idx][col_idx] is not None: while table_grid[row_idx][col_idx] is not None:
col_idx += 1 col_idx += 1
@ -477,15 +484,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
table_grid[row_idx + i][col_idx + j] = "" table_grid[row_idx + i][col_idx + j] = ""
cell = TableCell( cell = TableCell(
text=cell.text, text=cell_text,
row_span=row_span, row_span=row_span,
col_span=col_span, col_span=col_span,
start_row_offset_idx=row_idx, start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span, end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx, start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span, end_col_offset_idx=col_idx + col_span,
col_header=False, # col_header, col_header=False,
row_header=False, # ((not col_header) and html_cell.name=='th') row_header=False,
) )
data.table_cells.append(cell) data.table_cells.append(cell)