fix: Missing text in docx (t tag) when embedded in a table (#528)
Fix for missing text in docx (t tag) when embedded in a table Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
c830b92b2e
commit
b730b2d7a0
@ -133,7 +133,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||
for element in body:
|
||||
tag_name = etree.QName(element).localname
|
||||
|
||||
# Check for Inline Images (blip elements)
|
||||
namespaces = {
|
||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||
@ -153,6 +152,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
||||
# Check for Text
|
||||
elif tag_name in ["p"]:
|
||||
# "tcPr", "sectPr"
|
||||
self.handle_text_elements(element, docx_obj, doc)
|
||||
else:
|
||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||
@ -219,7 +219,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if paragraph.text is None:
|
||||
return
|
||||
text = paragraph.text.strip()
|
||||
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
# "List Bullet", "List Number", "List Paragraph"
|
||||
@ -291,9 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
||||
level = self.get_level()
|
||||
if isinstance(curr_level, int):
|
||||
|
||||
if curr_level > level:
|
||||
|
||||
# add invisible group
|
||||
for i in range(level, curr_level):
|
||||
self.parents[i] = doc.add_group(
|
||||
@ -301,9 +298,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=GroupLabel.SECTION,
|
||||
name=f"header-{i}",
|
||||
)
|
||||
|
||||
elif curr_level < level:
|
||||
|
||||
# remove the tail
|
||||
for key, val in self.parents.items():
|
||||
if key >= curr_level:
|
||||
@ -314,7 +309,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
text=text,
|
||||
level=curr_level,
|
||||
)
|
||||
|
||||
else:
|
||||
self.parents[self.level] = doc.add_heading(
|
||||
parent=self.parents[self.level - 1],
|
||||
@ -346,7 +340,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||
)
|
||||
|
||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
self.listIter += 1
|
||||
if is_numbered:
|
||||
enum_marker = str(self.listIter) + "."
|
||||
@ -365,7 +359,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level_at_new_list + self.prev_indent() + 1,
|
||||
self.level_at_new_list + ilevel + 1,
|
||||
):
|
||||
# TODO: determine if this is an unordered list or an ordered list.
|
||||
# Determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
self.listIter = 0
|
||||
if is_numbered:
|
||||
@ -467,6 +461,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
row_span = get_rowspan(cell)
|
||||
col_span = get_colspan(cell)
|
||||
|
||||
cell_text = cell.text
|
||||
# In case cell doesn't return text via docx library:
|
||||
if len(cell_text) == 0:
|
||||
cell_xml = cell._element
|
||||
|
||||
texts = [""]
|
||||
for elem in cell_xml.iter():
|
||||
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
||||
if elem.text:
|
||||
texts.append(elem.text)
|
||||
# Join the collected text
|
||||
cell_text = " ".join(texts).strip()
|
||||
|
||||
# Find the next available column in the grid
|
||||
while table_grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
@ -477,15 +484,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
table_grid[row_idx + i][col_idx + j] = ""
|
||||
|
||||
cell = TableCell(
|
||||
text=cell.text,
|
||||
text=cell_text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False, # col_header,
|
||||
row_header=False, # ((not col_header) and html_cell.name=='th')
|
||||
col_header=False,
|
||||
row_header=False,
|
||||
)
|
||||
|
||||
data.table_cells.append(cell)
|
||||
|
Loading…
Reference in New Issue
Block a user