fix(docx): merged table cells not properly converted (#857)
* fix(docx): merged cells not properly converted

  Fix a conversion issue with merged cells in Word tables that led to repeated text.
  Simplify the Word table conversion code.
  Add a docx file with several table formats for regression tests.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: add type hinting to docx backend

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
eff16b62cc
commit
0cd81a8122
@ -2,21 +2,28 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
import docx
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
DocumentOrigin,
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
ImageRef,
|
ImageRef,
|
||||||
|
NodeItem,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
|
from docx import Document
|
||||||
|
from docx.document import Document as DocxDocument
|
||||||
|
from docx.oxml.table import CT_Tc
|
||||||
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||||
|
from docx.table import Table, _Cell
|
||||||
|
from docx.text.paragraph import Paragraph
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
from typing_extensions import override
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -26,7 +33,10 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
@override
|
||||||
|
def __init__(
|
||||||
|
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
||||||
|
) -> None:
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.XML_KEY = (
|
self.XML_KEY = (
|
||||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||||
@ -36,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
}
|
}
|
||||||
# self.initialise(path_or_stream)
|
# self.initialise(path_or_stream)
|
||||||
# Word file:
|
# Word file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
||||||
self.valid = False
|
self.valid: bool = False
|
||||||
# Initialise the parents for the hierarchy
|
# Initialise the parents for the hierarchy
|
||||||
self.max_levels = 10
|
self.max_levels: int = 10
|
||||||
self.level_at_new_list = None
|
self.level_at_new_list: Optional[int] = None
|
||||||
self.parents = {} # type: ignore
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
self.level = 0
|
self.level = 0
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
self.history = {
|
self.history: dict[str, Any] = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
"levels": [None],
|
"levels": [None],
|
||||||
"numids": [None],
|
"numids": [None],
|
||||||
@ -58,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.docx_obj = None
|
self.docx_obj = None
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
self.docx_obj = docx.Document(self.path_or_stream)
|
self.docx_obj = Document(self.path_or_stream)
|
||||||
elif isinstance(self.path_or_stream, Path):
|
elif isinstance(self.path_or_stream, Path):
|
||||||
self.docx_obj = docx.Document(str(self.path_or_stream))
|
self.docx_obj = Document(str(self.path_or_stream))
|
||||||
|
|
||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -68,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
|
@override
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@override
|
||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@override
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
self.path_or_stream.close()
|
self.path_or_stream.close()
|
||||||
@ -82,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.path_or_stream = None
|
self.path_or_stream = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
@override
|
||||||
|
def supported_formats(cls) -> set[InputFormat]:
|
||||||
return {InputFormat.DOCX}
|
return {InputFormat.DOCX}
|
||||||
|
|
||||||
|
@override
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the DOCX into a structured document model.
|
"""Parses the DOCX into a structured document model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The parsed document.
|
||||||
|
"""
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=self.file.name or "file",
|
filename=self.file.name or "file",
|
||||||
@ -104,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_history(self, name, level, numid, ilevel):
|
def update_history(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
level: Optional[int],
|
||||||
|
numid: Optional[int],
|
||||||
|
ilevel: Optional[int],
|
||||||
|
):
|
||||||
self.history["names"].append(name)
|
self.history["names"].append(name)
|
||||||
self.history["levels"].append(level)
|
self.history["levels"].append(level)
|
||||||
|
|
||||||
self.history["numids"].append(numid)
|
self.history["numids"].append(numid)
|
||||||
self.history["indents"].append(ilevel)
|
self.history["indents"].append(ilevel)
|
||||||
|
|
||||||
def prev_name(self):
|
def prev_name(self) -> Optional[str]:
|
||||||
return self.history["names"][-1]
|
return self.history["names"][-1]
|
||||||
|
|
||||||
def prev_level(self):
|
def prev_level(self) -> Optional[int]:
|
||||||
return self.history["levels"][-1]
|
return self.history["levels"][-1]
|
||||||
|
|
||||||
def prev_numid(self):
|
def prev_numid(self) -> Optional[int]:
|
||||||
return self.history["numids"][-1]
|
return self.history["numids"][-1]
|
||||||
|
|
||||||
def prev_indent(self):
|
def prev_indent(self) -> Optional[int]:
|
||||||
return self.history["indents"][-1]
|
return self.history["indents"][-1]
|
||||||
|
|
||||||
def get_level(self) -> int:
|
def get_level(self) -> int:
|
||||||
@ -130,7 +155,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return k
|
return k
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(
|
||||||
|
self,
|
||||||
|
body: BaseOxmlElement,
|
||||||
|
docx_obj: DocxDocument,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
# Check for Inline Images (blip elements)
|
# Check for Inline Images (blip elements)
|
||||||
@ -150,7 +180,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug("could not parse a table, broken docx table")
|
_log.debug("could not parse a table, broken docx table")
|
||||||
|
|
||||||
elif drawing_blip:
|
elif drawing_blip:
|
||||||
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
self.handle_pictures(docx_obj, drawing_blip, doc)
|
||||||
# Check for the sdt containers, like table of contents
|
# Check for the sdt containers, like table of contents
|
||||||
elif tag_name in ["sdt"]:
|
elif tag_name in ["sdt"]:
|
||||||
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
||||||
@ -167,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
_log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def str_to_int(self, s, default=0):
|
def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]:
|
||||||
if s is None:
|
if s is None:
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
@ -175,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def split_text_and_number(self, input_string):
|
def split_text_and_number(self, input_string: str) -> list[str]:
|
||||||
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
|
||||||
if match:
|
if match:
|
||||||
parts = list(filter(None, match.groups()))
|
parts = list(filter(None, match.groups()))
|
||||||
@ -183,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return [input_string]
|
return [input_string]
|
||||||
|
|
||||||
def get_numId_and_ilvl(self, paragraph):
|
def get_numId_and_ilvl(
|
||||||
|
self, paragraph: Paragraph
|
||||||
|
) -> tuple[Optional[int], Optional[int]]:
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
numPr = paragraph._element.find(
|
numPr = paragraph._element.find(
|
||||||
".//w:numPr", namespaces=paragraph._element.nsmap
|
".//w:numPr", namespaces=paragraph._element.nsmap
|
||||||
@ -196,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None
|
||||||
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None
|
||||||
|
|
||||||
return self.str_to_int(numId, default=None), self.str_to_int(
|
return self.str_to_int(numId, None), self.str_to_int(ilvl, None)
|
||||||
ilvl, default=None
|
|
||||||
)
|
|
||||||
|
|
||||||
return None, None # If the paragraph is not part of a list
|
return None, None # If the paragraph is not part of a list
|
||||||
|
|
||||||
def get_label_and_level(self, paragraph):
|
def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]:
|
||||||
if paragraph.style is None:
|
if paragraph.style is None:
|
||||||
return "Normal", None
|
return "Normal", None
|
||||||
label = paragraph.style.style_id
|
label = paragraph.style.style_id
|
||||||
@ -218,20 +248,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if "Heading" in label and len(parts) == 2:
|
if "Heading" in label and len(parts) == 2:
|
||||||
parts.sort()
|
parts.sort()
|
||||||
label_str = ""
|
label_str: str = ""
|
||||||
label_level = 0
|
label_level: Optional[int] = 0
|
||||||
if parts[0] == "Heading":
|
if parts[0] == "Heading":
|
||||||
label_str = parts[0]
|
label_str = parts[0]
|
||||||
label_level = self.str_to_int(parts[1], default=None)
|
label_level = self.str_to_int(parts[1], None)
|
||||||
if parts[1] == "Heading":
|
if parts[1] == "Heading":
|
||||||
label_str = parts[1]
|
label_str = parts[1]
|
||||||
label_level = self.str_to_int(parts[0], default=None)
|
label_level = self.str_to_int(parts[0], None)
|
||||||
return label_str, label_level
|
return label_str, label_level
|
||||||
else:
|
else:
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
def handle_text_elements(self, element, docx_obj, doc):
|
def handle_text_elements(
|
||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
self,
|
||||||
|
element: BaseOxmlElement,
|
||||||
|
docx_obj: DocxDocument,
|
||||||
|
doc: DoclingDocument,
|
||||||
|
) -> None:
|
||||||
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
return
|
return
|
||||||
@ -255,11 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
and p_style_id not in ["Title", "Heading"]
|
and p_style_id not in ["Title", "Heading"]
|
||||||
):
|
):
|
||||||
self.add_listitem(
|
self.add_listitem(
|
||||||
element,
|
|
||||||
docx_obj,
|
|
||||||
doc,
|
doc,
|
||||||
p_style_id,
|
|
||||||
p_level,
|
|
||||||
numid,
|
numid,
|
||||||
ilevel,
|
ilevel,
|
||||||
text,
|
text,
|
||||||
@ -284,13 +315,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level = 0
|
self.level = 0
|
||||||
|
|
||||||
if p_style_id in ["Title"]:
|
if p_style_id in ["Title"]:
|
||||||
for key, val in self.parents.items():
|
for key in range(len(self.parents)):
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.parents[0] = doc.add_text(
|
self.parents[0] = doc.add_text(
|
||||||
parent=None, label=DocItemLabel.TITLE, text=text
|
parent=None, label=DocItemLabel.TITLE, text=text
|
||||||
)
|
)
|
||||||
elif "Heading" in p_style_id:
|
elif "Heading" in p_style_id:
|
||||||
self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
|
self.add_header(doc, p_level, text)
|
||||||
|
|
||||||
elif p_style_id in [
|
elif p_style_id in [
|
||||||
"Paragraph",
|
"Paragraph",
|
||||||
@ -318,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
|
|
||||||
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
def add_header(
|
||||||
|
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
||||||
|
) -> None:
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
if isinstance(curr_level, int):
|
if isinstance(curr_level, int):
|
||||||
if curr_level > level:
|
if curr_level > level:
|
||||||
@ -331,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
elif curr_level < level:
|
elif curr_level < level:
|
||||||
# remove the tail
|
# remove the tail
|
||||||
for key, val in self.parents.items():
|
for key in range(len(self.parents)):
|
||||||
if key >= curr_level:
|
if key >= curr_level:
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
|
|
||||||
@ -350,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def add_listitem(
|
def add_listitem(
|
||||||
self,
|
self,
|
||||||
element,
|
doc: DoclingDocument,
|
||||||
docx_obj,
|
numid: int,
|
||||||
doc,
|
ilevel: int,
|
||||||
p_style_id,
|
|
||||||
p_level,
|
|
||||||
numid,
|
|
||||||
ilevel,
|
|
||||||
text: str,
|
text: str,
|
||||||
is_numbered=False,
|
is_numbered: bool = False,
|
||||||
):
|
) -> None:
|
||||||
# is_numbered = is_numbered
|
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
|
prev_indent = self.prev_indent()
|
||||||
if self.prev_numid() is None: # Open new list
|
if self.prev_numid() is None: # Open new list
|
||||||
self.level_at_new_list = level # type: ignore
|
self.level_at_new_list = level
|
||||||
|
|
||||||
self.parents[level] = doc.add_group(
|
self.parents[level] = doc.add_group(
|
||||||
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
|
||||||
@ -384,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
self.prev_numid() == numid and self.prev_indent() < ilevel
|
self.prev_numid() == numid
|
||||||
|
and self.level_at_new_list is not None
|
||||||
|
and prev_indent is not None
|
||||||
|
and prev_indent < ilevel
|
||||||
): # Open indented list
|
): # Open indented list
|
||||||
for i in range(
|
for i in range(
|
||||||
self.level_at_new_list + self.prev_indent() + 1,
|
self.level_at_new_list + prev_indent + 1,
|
||||||
self.level_at_new_list + ilevel + 1,
|
self.level_at_new_list + ilevel + 1,
|
||||||
):
|
):
|
||||||
# Determine if this is an unordered list or an ordered list.
|
# Determine if this is an unordered list or an ordered list.
|
||||||
@ -416,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list
|
elif (
|
||||||
|
self.prev_numid() == numid
|
||||||
|
and self.level_at_new_list is not None
|
||||||
|
and prev_indent is not None
|
||||||
|
and ilevel < prev_indent
|
||||||
|
): # Close list
|
||||||
for k, v in self.parents.items():
|
for k, v in self.parents.items():
|
||||||
if k > self.level_at_new_list + ilevel:
|
if k > self.level_at_new_list + ilevel:
|
||||||
self.parents[k] = None
|
self.parents[k] = None
|
||||||
@ -434,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
elif self.prev_numid() == numid or self.prev_indent() == ilevel:
|
elif self.prev_numid() == numid or prev_indent == ilevel:
|
||||||
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
# TODO: Set marker and enumerated arguments if this is an enumeration element.
|
||||||
self.listIter += 1
|
self.listIter += 1
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
@ -448,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_tables(self, element, docx_obj, doc):
|
def handle_tables(
|
||||||
|
self,
|
||||||
# Function to check if a cell has a colspan (gridSpan)
|
element: BaseOxmlElement,
|
||||||
def get_colspan(cell):
|
docx_obj: DocxDocument,
|
||||||
grid_span = cell._element.xpath("@w:gridSpan")
|
doc: DoclingDocument,
|
||||||
if grid_span:
|
) -> None:
|
||||||
return int(grid_span[0]) # Return the number of columns spanned
|
table: Table = Table(element, docx_obj)
|
||||||
return 1 # Default is 1 (no colspan)
|
|
||||||
|
|
||||||
# Function to check if a cell has a rowspan (vMerge)
|
|
||||||
def get_rowspan(cell):
|
|
||||||
v_merge = cell._element.xpath("@w:vMerge")
|
|
||||||
if v_merge:
|
|
||||||
return v_merge[
|
|
||||||
0
|
|
||||||
] # 'restart' indicates the beginning of a rowspan, others are continuation
|
|
||||||
return 1
|
|
||||||
|
|
||||||
table = docx.table.Table(element, docx_obj)
|
|
||||||
|
|
||||||
num_rows = len(table.rows)
|
num_rows = len(table.rows)
|
||||||
num_cols = 0
|
num_cols = len(table.columns)
|
||||||
for row in table.rows:
|
_log.debug(f"Table grid with {num_rows} rows and {num_cols} columns")
|
||||||
# Calculate the max number of columns
|
|
||||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
|
||||||
|
|
||||||
if num_rows == 1 and num_cols == 1:
|
if num_rows == 1 and num_cols == 1:
|
||||||
cell_element = table.rows[0].cells[0]
|
cell_element = table.rows[0].cells[0]
|
||||||
@ -481,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.walk_linear(cell_element._element, docx_obj, doc)
|
self.walk_linear(cell_element._element, docx_obj, doc)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Initialize the table grid
|
data = TableData(num_rows=num_rows, num_cols=num_cols)
|
||||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
cell_set: set[CT_Tc] = set()
|
||||||
|
|
||||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
|
||||||
|
|
||||||
for row_idx, row in enumerate(table.rows):
|
for row_idx, row in enumerate(table.rows):
|
||||||
|
_log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells")
|
||||||
col_idx = 0
|
col_idx = 0
|
||||||
for c, cell in enumerate(row.cells):
|
while col_idx < num_cols:
|
||||||
row_span = get_rowspan(cell)
|
cell: _Cell = row.cells[col_idx]
|
||||||
col_span = get_colspan(cell)
|
_log.debug(
|
||||||
|
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
||||||
|
)
|
||||||
|
if cell is None or cell._tc in cell_set:
|
||||||
|
_log.debug(f" skipped since repeated content")
|
||||||
|
col_idx += cell.grid_span
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
cell_set.add(cell._tc)
|
||||||
|
|
||||||
cell_text = cell.text
|
spanned_idx = row_idx
|
||||||
# In case cell doesn't return text via docx library:
|
spanned_tc: Optional[CT_Tc] = cell._tc
|
||||||
if len(cell_text) == 0:
|
while spanned_tc == cell._tc:
|
||||||
cell_xml = cell._element
|
spanned_idx += 1
|
||||||
|
spanned_tc = (
|
||||||
|
table.rows[spanned_idx].cells[col_idx]._tc
|
||||||
|
if spanned_idx < num_rows
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
_log.debug(f" spanned before row {spanned_idx}")
|
||||||
|
|
||||||
texts = [""]
|
table_cell = TableCell(
|
||||||
for elem in cell_xml.iter():
|
text=cell.text,
|
||||||
if elem.tag.endswith("t"): # <w:t> tags that contain text
|
row_span=spanned_idx - row_idx,
|
||||||
if elem.text:
|
col_span=cell.grid_span,
|
||||||
texts.append(elem.text)
|
start_row_offset_idx=row.grid_cols_before + row_idx,
|
||||||
# Join the collected text
|
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
||||||
cell_text = " ".join(texts).strip()
|
|
||||||
|
|
||||||
# Find the next available column in the grid
|
|
||||||
while table_grid[row_idx][col_idx] is not None:
|
|
||||||
col_idx += 1
|
|
||||||
|
|
||||||
# Fill the grid with the cell value, considering rowspan and colspan
|
|
||||||
for i in range(row_span if row_span == "restart" else 1):
|
|
||||||
for j in range(col_span):
|
|
||||||
table_grid[row_idx + i][col_idx + j] = ""
|
|
||||||
|
|
||||||
cell = TableCell(
|
|
||||||
text=cell_text,
|
|
||||||
row_span=row_span,
|
|
||||||
col_span=col_span,
|
|
||||||
start_row_offset_idx=row_idx,
|
|
||||||
end_row_offset_idx=row_idx + row_span,
|
|
||||||
start_col_offset_idx=col_idx,
|
start_col_offset_idx=col_idx,
|
||||||
end_col_offset_idx=col_idx + col_span,
|
end_col_offset_idx=col_idx + cell.grid_span,
|
||||||
col_header=False,
|
col_header=False,
|
||||||
row_header=False,
|
row_header=False,
|
||||||
)
|
)
|
||||||
|
data.table_cells.append(table_cell)
|
||||||
data.table_cells.append(cell)
|
col_idx += cell.grid_span
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
|
def handle_pictures(
|
||||||
def get_docx_image(element, drawing_blip):
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
||||||
|
) -> None:
|
||||||
|
def get_docx_image(drawing_blip):
|
||||||
rId = drawing_blip[0].get(
|
rId = drawing_blip[0].get(
|
||||||
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
||||||
)
|
)
|
||||||
@ -546,7 +565,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
# Open the BytesIO object with PIL to create an Image
|
# Open the BytesIO object with PIL to create an Image
|
||||||
try:
|
try:
|
||||||
image_data = get_docx_image(element, drawing_blip)
|
image_data = get_docx_image(drawing_blip)
|
||||||
image_bytes = BytesIO(image_data)
|
image_bytes = BytesIO(image_data)
|
||||||
pil_image = Image.open(image_bytes)
|
pil_image = Image.open(image_bytes)
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
|
BIN
tests/data/docx/word_tables.docx
Normal file
BIN
tests/data/docx/word_tables.docx
Normal file
Binary file not shown.
75
tests/data/groundtruth/docling_v2/word_tables.docx.html
Normal file
75
tests/data/groundtruth/docling_v2/word_tables.docx.html
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<link rel="icon" type="image/png"
|
||||||
|
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>
|
||||||
|
Powered by Docling
|
||||||
|
</title>
|
||||||
|
<style>
|
||||||
|
html {
|
||||||
|
background-color: LightGray;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
margin: 0 auto;
|
||||||
|
width:800px;
|
||||||
|
padding: 30px;
|
||||||
|
background-color: White;
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
box-shadow: 10px 10px 10px grey;
|
||||||
|
}
|
||||||
|
figure{
|
||||||
|
display: block;
|
||||||
|
width: 100%;
|
||||||
|
margin: 0px;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
img {
|
||||||
|
display: block;
|
||||||
|
margin: auto;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
max-width: 640px;
|
||||||
|
max-height: 640px;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
min-width:500px;
|
||||||
|
background-color: White;
|
||||||
|
border-collapse: collapse;
|
||||||
|
cell-padding: 5px;
|
||||||
|
margin: auto;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
th, td {
|
||||||
|
border: 1px solid black;
|
||||||
|
padding: 8px;
|
||||||
|
}
|
||||||
|
th {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
table tr:nth-child(even) td{
|
||||||
|
background-color: LightGray;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<h2>Test with tables</h2>
|
||||||
|
<p>A uniform table</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with horizontal spans</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with horizontal spans in inner columns</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with vertical spans</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with all kinds of spans and empty cells</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p></p>
|
||||||
|
</html>
|
19
tests/data/groundtruth/docling_v2/word_tables.docx.itxt
Normal file
19
tests/data/groundtruth/docling_v2/word_tables.docx.itxt
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group header-0
|
||||||
|
item-2 at level 2: section_header: Test with tables
|
||||||
|
item-3 at level 3: paragraph: A uniform table
|
||||||
|
item-4 at level 3: table with [3x3]
|
||||||
|
item-5 at level 3: paragraph:
|
||||||
|
item-6 at level 3: paragraph: A non-uniform table with horizontal spans
|
||||||
|
item-7 at level 3: table with [3x3]
|
||||||
|
item-8 at level 3: paragraph:
|
||||||
|
item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns
|
||||||
|
item-10 at level 3: table with [3x4]
|
||||||
|
item-11 at level 3: paragraph:
|
||||||
|
item-12 at level 3: paragraph: A non-uniform table with vertical spans
|
||||||
|
item-13 at level 3: table with [5x3]
|
||||||
|
item-14 at level 3: paragraph:
|
||||||
|
item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells
|
||||||
|
item-16 at level 3: table with [9x5]
|
||||||
|
item-17 at level 3: paragraph:
|
||||||
|
item-18 at level 3: paragraph:
|
2356
tests/data/groundtruth/docling_v2/word_tables.docx.json
Normal file
2356
tests/data/groundtruth/docling_v2/word_tables.docx.json
Normal file
File diff suppressed because it is too large
Load Diff
44
tests/data/groundtruth/docling_v2/word_tables.docx.md
Normal file
44
tests/data/groundtruth/docling_v2/word_tables.docx.md
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
## Test with tables
|
||||||
|
|
||||||
|
A uniform table
|
||||||
|
|
||||||
|
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||||
|
|--------------|--------------|--------------|
|
||||||
|
| Cell 1.0 | Cell 1.1 | Cell 1.2 |
|
||||||
|
| Cell 2.0 | Cell 2.1 | Cell 2.2 |
|
||||||
|
|
||||||
|
A non-uniform table with horizontal spans
|
||||||
|
|
||||||
|
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||||
|
|--------------|---------------------|---------------------|
|
||||||
|
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 |
|
||||||
|
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 |
|
||||||
|
|
||||||
|
A non-uniform table with horizontal spans in inner columns
|
||||||
|
|
||||||
|
| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 |
|
||||||
|
|--------------|---------------------|---------------------|--------------|
|
||||||
|
| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 |
|
||||||
|
| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 |
|
||||||
|
|
||||||
|
A non-uniform table with vertical spans
|
||||||
|
|
||||||
|
| Header 0.0 | Header 0.1 | Header 0.2 |
|
||||||
|
|--------------|---------------------|--------------|
|
||||||
|
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 |
|
||||||
|
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 |
|
||||||
|
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 |
|
||||||
|
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 |
|
||||||
|
|
||||||
|
A non-uniform table with all kinds of spans and empty cells
|
||||||
|
|
||||||
|
| Header 0.0 | Header 0.1 | Header 0.2 | | |
|
||||||
|
|--------------|---------------------|--------------|----|---------------------|
|
||||||
|
| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | |
|
||||||
|
| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | |
|
||||||
|
| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | |
|
||||||
|
| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 |
|
||||||
|
| | | | | Merged Cell 4.4 5.4 |
|
||||||
|
| | | | | |
|
||||||
|
| | | | | |
|
||||||
|
| | | | | Cell 8.4 |
|
@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str):
|
|||||||
with open(gtfile, "r") as fr:
|
with open(gtfile, "r") as fr:
|
||||||
true_text = fr.read()
|
true_text = fr.read()
|
||||||
|
|
||||||
assert pred_text == true_text, "pred_itxt==true_itxt"
|
|
||||||
return pred_text == true_text
|
return pred_text == true_text
|
||||||
|
|
||||||
|
|
||||||
@ -101,3 +100,7 @@ def test_e2e_docx_conversions():
|
|||||||
|
|
||||||
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
||||||
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
||||||
|
|
||||||
|
if docx_path.name == "word_tables.docx":
|
||||||
|
pred_html: str = doc.export_to_html()
|
||||||
|
assert verify_export(pred_html, str(gt_path) + ".html"), "export to html"
|
||||||
|
75
word_tables.html
Normal file
75
word_tables.html
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<link rel="icon" type="image/png"
|
||||||
|
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<title>
|
||||||
|
Powered by Docling
|
||||||
|
</title>
|
||||||
|
<style>
|
||||||
|
html {
|
||||||
|
background-color: LightGray;
|
||||||
|
}
|
||||||
|
body {
|
||||||
|
margin: 0 auto;
|
||||||
|
width:800px;
|
||||||
|
padding: 30px;
|
||||||
|
background-color: White;
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
box-shadow: 10px 10px 10px grey;
|
||||||
|
}
|
||||||
|
figure{
|
||||||
|
display: block;
|
||||||
|
width: 100%;
|
||||||
|
margin: 0px;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
img {
|
||||||
|
display: block;
|
||||||
|
margin: auto;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
max-width: 640px;
|
||||||
|
max-height: 640px;
|
||||||
|
}
|
||||||
|
table {
|
||||||
|
min-width:500px;
|
||||||
|
background-color: White;
|
||||||
|
border-collapse: collapse;
|
||||||
|
cell-padding: 5px;
|
||||||
|
margin: auto;
|
||||||
|
margin-top: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
th, td {
|
||||||
|
border: 1px solid black;
|
||||||
|
padding: 8px;
|
||||||
|
}
|
||||||
|
th {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
table tr:nth-child(even) td{
|
||||||
|
background-color: LightGray;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<h2>Test with tables</h2>
|
||||||
|
<p>A uniform table</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td>Cell 1.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.1</td><td>Cell 2.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with horizontal spans</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with horizontal spans in inner columns</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td>Header 0.3</td></tr><tr><td>Cell 1.0</td><td colspan="2">Merged Cell 1.1 1.2</td><td>Cell 1.3</td></tr><tr><td>Cell 2.0</td><td colspan="2">Merged Cell 2.1 2.2</td><td>Cell 2.3</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with vertical spans</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p>A non-uniform table with all kinds of spans and empty cells</p>
|
||||||
|
<table><tbody><tr><td>Header 0.0</td><td>Header 0.1</td><td>Header 0.2</td><td></td><td></td></tr><tr><td>Cell 1.0</td><td rowspan="2">Merged Cell 1.1 2.1</td><td>Cell 1.2</td><td></td><td></td></tr><tr><td>Cell 2.0</td><td>Cell 2.2</td><td></td><td></td></tr><tr><td>Cell 3.0</td><td rowspan="2">Merged Cell 3.1 4.1</td><td>Cell 3.2</td><td rowspan="3"></td><td></td></tr><tr><td>Cell 4.0</td><td>Cell 4.2</td><td rowspan="2">Merged Cell 4.4 5.4</td></tr><tr><td></td><td></td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td></tr><tr><td colspan="5"></td></tr><tr><td></td><td></td><td></td><td></td><td>Cell 8.4</td></tr></tbody></table>
|
||||||
|
<p></p>
|
||||||
|
<p></p>
|
||||||
|
</html>
|
Loading…
Reference in New Issue
Block a user