fix: Missing text in docx (t tag) when embedded in a table (#528)
Fix for missing text in docx (t tag) when embedded in a table Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
c830b92b2e
commit
b730b2d7a0
@ -133,7 +133,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
|
||||||
# Check for Inline Images (blip elements)
|
# Check for Inline Images (blip elements)
|
||||||
namespaces = {
|
namespaces = {
|
||||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
@ -153,6 +152,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
||||||
# Check for Text
|
# Check for Text
|
||||||
elif tag_name in ["p"]:
|
elif tag_name in ["p"]:
|
||||||
|
# "tcPr", "sectPr"
|
||||||
self.handle_text_elements(element, docx_obj, doc)
|
self.handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
@ -219,7 +219,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
return
|
return
|
||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
@ -291,9 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
if isinstance(curr_level, int):
|
if isinstance(curr_level, int):
|
||||||
|
|
||||||
if curr_level > level:
|
if curr_level > level:
|
||||||
|
|
||||||
# add invisible group
|
# add invisible group
|
||||||
for i in range(level, curr_level):
|
for i in range(level, curr_level):
|
||||||
self.parents[i] = doc.add_group(
|
self.parents[i] = doc.add_group(
|
||||||
@ -301,9 +298,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=GroupLabel.SECTION,
|
label=GroupLabel.SECTION,
|
||||||
name=f"header-{i}",
|
name=f"header-{i}",
|
||||||
)
|
)
|
||||||
|
|
||||||
elif curr_level < level:
|
elif curr_level < level:
|
||||||
|
|
||||||
# remove the tail
|
# remove the tail
|
||||||
for key, val in self.parents.items():
|
for key, val in self.parents.items():
|
||||||
if key >= curr_level:
|
if key >= curr_level:
|
||||||
@ -314,7 +309,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
level=curr_level,
|
level=curr_level,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.parents[self.level] = doc.add_heading(
|
self.parents[self.level] = doc.add_heading(
|
||||||
parent=self.parents[self.level - 1],
|
parent=self.parents[self.level - 1],
|
||||||
@ -346,7 +340,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
self.listIter += 1
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
@ -365,8 +359,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level_at_new_list + self.prev_indent() + 1,
|
self.level_at_new_list + self.prev_indent() + 1,
|
||||||
self.level_at_new_list + ilevel + 1,
|
self.level_at_new_list + ilevel + 1,
|
||||||
):
|
):
|
||||||
# TODO: determine if this is an unordered list or an ordered list.
|
# Determine if this is an unordered list or an ordered list.
|
||||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
self.parents[i] = doc.add_group(
|
self.parents[i] = doc.add_group(
|
||||||
@ -467,6 +461,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
row_span = get_rowspan(cell)
|
row_span = get_rowspan(cell)
|
||||||
col_span = get_colspan(cell)
|
col_span = get_colspan(cell)
|
||||||
|
|
||||||
|
cell_text = cell.text
|
||||||
|
# In case cell doesn't return text via docx library:
|
||||||
|
if len(cell_text) == 0:
|
||||||
|
cell_xml = cell._element
|
||||||
|
|
||||||
|
texts = [""]
|
||||||
|
for elem in cell_xml.iter():
|
||||||
|
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
||||||
|
if elem.text:
|
||||||
|
texts.append(elem.text)
|
||||||
|
# Join the collected text
|
||||||
|
cell_text = " ".join(texts).strip()
|
||||||
|
|
||||||
# Find the next available column in the grid
|
# Find the next available column in the grid
|
||||||
while table_grid[row_idx][col_idx] is not None:
|
while table_grid[row_idx][col_idx] is not None:
|
||||||
col_idx += 1
|
col_idx += 1
|
||||||
@ -477,15 +484,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
table_grid[row_idx + i][col_idx + j] = ""
|
table_grid[row_idx + i][col_idx + j] = ""
|
||||||
|
|
||||||
cell = TableCell(
|
cell = TableCell(
|
||||||
text=cell.text,
|
text=cell_text,
|
||||||
row_span=row_span,
|
row_span=row_span,
|
||||||
col_span=col_span,
|
col_span=col_span,
|
||||||
start_row_offset_idx=row_idx,
|
start_row_offset_idx=row_idx,
|
||||||
end_row_offset_idx=row_idx + row_span,
|
end_row_offset_idx=row_idx + row_span,
|
||||||
start_col_offset_idx=col_idx,
|
start_col_offset_idx=col_idx,
|
||||||
end_col_offset_idx=col_idx + col_span,
|
end_col_offset_idx=col_idx + col_span,
|
||||||
col_header=False, # col_header,
|
col_header=False,
|
||||||
row_header=False, # ((not col_header) and html_cell.name=='th')
|
row_header=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
data.table_cells.append(cell)
|
data.table_cells.append(cell)
|
||||||
|
Loading…
Reference in New Issue
Block a user