fix: Handling of single-cell tables in DOCX backend (#314)
* Handling of single-cell tables in DOCX backend Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * returned try-catch on tables handling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * cleaned Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * proceed processing the content of single cell table as if it's just part of the body Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added example of tricky 1 cell table docx Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
7f5d35ea3c
commit
fb8ba861e2
@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
|
||||||
# Check for Inline Images (drawings or blip elements)
|
# Check for Inline Images (drawings or blip elements)
|
||||||
found_drawing = etree.ElementBase.xpath(
|
found_drawing = etree.ElementBase.xpath(
|
||||||
element, ".//w:drawing", namespaces=self.xml_namespaces
|
element, ".//w:drawing", namespaces=self.xml_namespaces
|
||||||
@ -201,7 +200,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label_str = ""
|
label_str = ""
|
||||||
label_level = 0
|
label_level = 0
|
||||||
if parts[0] == "Heading":
|
if parts[0] == "Heading":
|
||||||
# print("{} - {}".format(parts[0], parts[1]))
|
|
||||||
label_str = parts[0]
|
label_str = parts[0]
|
||||||
label_level = self.str_to_int(parts[1], default=None)
|
label_level = self.str_to_int(parts[1], default=None)
|
||||||
if parts[1] == "Heading":
|
if parts[1] == "Heading":
|
||||||
@ -217,19 +215,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
# _log.warn(f"paragraph has text==None")
|
# _log.warn(f"paragraph has text==None")
|
||||||
return
|
return
|
||||||
|
|
||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
# if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
# TODO: reliably identify wether list is a numbered list or not
|
# Identify wether list is a numbered list or not
|
||||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
|
|
||||||
p_style_name, p_level = self.get_label_and_level(paragraph)
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
||||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||||
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
|
||||||
|
|
||||||
if numid == 0:
|
if numid == 0:
|
||||||
numid = None
|
numid = None
|
||||||
@ -450,8 +445,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
# Calculate the max number of columns
|
# Calculate the max number of columns
|
||||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
||||||
# if row.cells:
|
|
||||||
# num_cols = max(num_cols, len(row.cells))
|
if num_rows == 1 and num_cols == 1:
|
||||||
|
cell_element = table.rows[0].cells[0]
|
||||||
|
# In case we have a table of only 1 cell, we consider it furniture
|
||||||
|
# And proceed processing the content of the cell as though it's in the document body
|
||||||
|
self.walk_linear(cell_element._element, docx_obj, doc)
|
||||||
|
return
|
||||||
|
|
||||||
# Initialize the table grid
|
# Initialize the table grid
|
||||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||||
|
BIN
tests/data/docx/tablecell.docx
Normal file
BIN
tests/data/docx/tablecell.docx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user