diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 02f8c86..4d4026e 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -2,21 +2,28 @@ import logging import re from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Any, Optional, Union -import docx from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, ImageRef, + NodeItem, TableCell, TableData, ) +from docx import Document +from docx.document import Document as DocxDocument +from docx.oxml.table import CT_Tc +from docx.oxml.xmlchemy import BaseOxmlElement +from docx.table import Table, _Cell +from docx.text.paragraph import Paragraph from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError +from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat @@ -26,7 +33,10 @@ _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + @override + def __init__( + self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + ) -> None: super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -36,19 +46,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): } # self.initialise(path_or_stream) # Word file: - self.path_or_stream = path_or_stream - self.valid = False + self.path_or_stream: Union[BytesIO, Path] = path_or_stream + self.valid: bool = False # Initialise the parents for the hierarchy - self.max_levels = 10 - self.level_at_new_list = None - self.parents = {} # type: ignore + self.max_levels: int = 10 + self.level_at_new_list: Optional[int] = None + self.parents: dict[int, Optional[NodeItem]] = {} for i in range(-1, self.max_levels): self.parents[i] = None self.level = 0 self.listIter = 0 - self.history = { + self.history: dict[str, Any] = { "names": [None], "levels": [None], "numids": [None], @@ -58,9 +68,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.docx_obj = None try: if isinstance(self.path_or_stream, BytesIO): - self.docx_obj = docx.Document(self.path_or_stream) + self.docx_obj = Document(self.path_or_stream) elif isinstance(self.path_or_stream, Path): - self.docx_obj = docx.Document(str(self.path_or_stream)) + self.docx_obj = Document(str(self.path_or_stream)) self.valid = True except Exception as e: @@ -68,13 +78,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" ) from e + @override def is_valid(self) -> bool: return self.valid @classmethod + @override def supports_pagination(cls) -> bool: return False + @override def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() @@ -82,11 +95,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.path_or_stream = None @classmethod - def supported_formats(cls) -> Set[InputFormat]: + @override + def supported_formats(cls) -> set[InputFormat]: return {InputFormat.DOCX} + @override def convert(self) -> DoclingDocument: - # Parses the DOCX into a structured document model. + """Parses the DOCX into a structured document model. + + Returns: + The parsed document. + """ origin = DocumentOrigin( filename=self.file.name or "file", @@ -104,23 +123,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) - def update_history(self, name, level, numid, ilevel): + def update_history( + self, + name: str, + level: Optional[int], + numid: Optional[int], + ilevel: Optional[int], + ): self.history["names"].append(name) self.history["levels"].append(level) self.history["numids"].append(numid) self.history["indents"].append(ilevel) - def prev_name(self): + def prev_name(self) -> Optional[str]: return self.history["names"][-1] - def prev_level(self): + def prev_level(self) -> Optional[int]: return self.history["levels"][-1] - def prev_numid(self): + def prev_numid(self) -> Optional[int]: return self.history["numids"][-1] - def prev_indent(self): + def prev_indent(self) -> Optional[int]: return self.history["indents"][-1] def get_level(self) -> int: @@ -130,7 +155,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return k return 0 - def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: + def walk_linear( + self, + body: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname # Check for Inline Images (blip elements) @@ -150,7 +180,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug("could not parse a table, broken docx table") elif drawing_blip: - self.handle_pictures(element, docx_obj, drawing_blip, doc) + self.handle_pictures(docx_obj, drawing_blip, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) @@ -167,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") return doc - def str_to_int(self, s, default=0): + def str_to_int(self, s: Optional[str], default: Optional[int] = 0) -> Optional[int]: if s is None: return None try: @@ -175,7 +205,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): except ValueError: return default - def split_text_and_number(self, input_string): + def split_text_and_number(self, input_string: str) -> list[str]: match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) if match: parts = list(filter(None, match.groups())) @@ -183,7 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return [input_string] - def get_numId_and_ilvl(self, paragraph): + def get_numId_and_ilvl( + self, paragraph: Paragraph + ) -> tuple[Optional[int], Optional[int]]: # Access the XML element of the paragraph numPr = paragraph._element.find( ".//w:numPr", namespaces=paragraph._element.nsmap @@ -196,13 +228,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): numId = numId_elem.get(self.XML_KEY) if numId_elem is not None else None ilvl = ilvl_elem.get(self.XML_KEY) if ilvl_elem is not None else None - return self.str_to_int(numId, default=None), self.str_to_int( - ilvl, default=None - ) + return self.str_to_int(numId, None), self.str_to_int(ilvl, None) return None, None # If the paragraph is not part of a list - def get_label_and_level(self, paragraph): + def get_label_and_level(self, paragraph: Paragraph) -> tuple[str, Optional[int]]: if paragraph.style is None: return "Normal", None label = paragraph.style.style_id @@ -218,20 +248,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if "Heading" in label and len(parts) == 2: parts.sort() - label_str = "" - label_level = 0 + label_str: str = "" + label_level: Optional[int] = 0 if parts[0] == "Heading": label_str = parts[0] - label_level = self.str_to_int(parts[1], default=None) + label_level = self.str_to_int(parts[1], None) if parts[1] == "Heading": label_str = parts[1] - label_level = self.str_to_int(parts[0], default=None) + label_level = self.str_to_int(parts[0], None) return label_str, label_level else: return label, None - def handle_text_elements(self, element, docx_obj, doc): - paragraph = docx.text.paragraph.Paragraph(element, docx_obj) + def handle_text_elements( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: + paragraph = Paragraph(element, docx_obj) if paragraph.text is None: return @@ -255,11 +290,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): and p_style_id not in ["Title", "Heading"] ): self.add_listitem( - element, - docx_obj, doc, - p_style_id, - p_level, numid, ilevel, text, @@ -284,13 +315,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level = 0 if p_style_id in ["Title"]: - for key, val in self.parents.items(): + for key in range(len(self.parents)): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) elif "Heading" in p_style_id: - self.add_header(element, docx_obj, doc, p_style_id, p_level, text) + self.add_header(doc, p_level, text) elif p_style_id in [ "Paragraph", @@ -318,7 +349,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.update_history(p_style_id, p_level, numid, ilevel) return - def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): + def add_header( + self, doc: DoclingDocument, curr_level: Optional[int], text: str + ) -> None: level = self.get_level() if isinstance(curr_level, int): if curr_level > level: @@ -331,7 +364,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) elif curr_level < level: # remove the tail - for key, val in self.parents.items(): + for key in range(len(self.parents)): if key >= curr_level: self.parents[key] = None @@ -350,22 +383,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def add_listitem( self, - element, - docx_obj, - doc, - p_style_id, - p_level, - numid, - ilevel, + doc: DoclingDocument, + numid: int, + ilevel: int, text: str, - is_numbered=False, - ): - # is_numbered = is_numbered + is_numbered: bool = False, + ) -> None: enum_marker = "" level = self.get_level() + prev_indent = self.prev_indent() if self.prev_numid() is None: # Open new list - self.level_at_new_list = level # type: ignore + self.level_at_new_list = level self.parents[level] = doc.add_group( label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] @@ -384,10 +413,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) elif ( - self.prev_numid() == numid and self.prev_indent() < ilevel + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and prev_indent < ilevel ): # Open indented list for i in range( - self.level_at_new_list + self.prev_indent() + 1, + self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): # Determine if this is an unordered list or an ordered list. @@ -416,7 +448,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): text=text, ) - elif self.prev_numid() == numid and ilevel < self.prev_indent(): # Close list + elif ( + self.prev_numid() == numid + and self.level_at_new_list is not None + and prev_indent is not None + and ilevel < prev_indent + ): # Close list for k, v in self.parents.items(): if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -434,7 +471,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) self.listIter = 0 - elif self.prev_numid() == numid or self.prev_indent() == ilevel: + elif self.prev_numid() == numid or prev_indent == ilevel: # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: @@ -448,31 +485,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return - def handle_tables(self, element, docx_obj, doc): - - # Function to check if a cell has a colspan (gridSpan) - def get_colspan(cell): - grid_span = cell._element.xpath("@w:gridSpan") - if grid_span: - return int(grid_span[0]) # Return the number of columns spanned - return 1 # Default is 1 (no colspan) - - # Function to check if a cell has a rowspan (vMerge) - def get_rowspan(cell): - v_merge = cell._element.xpath("@w:vMerge") - if v_merge: - return v_merge[ - 0 - ] # 'restart' indicates the beginning of a rowspan, others are continuation - return 1 - - table = docx.table.Table(element, docx_obj) - + def handle_tables( + self, + element: BaseOxmlElement, + docx_obj: DocxDocument, + doc: DoclingDocument, + ) -> None: + table: Table = Table(element, docx_obj) num_rows = len(table.rows) - num_cols = 0 - for row in table.rows: - # Calculate the max number of columns - num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) + num_cols = len(table.columns) + _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") if num_rows == 1 and num_cols == 1: cell_element = table.rows[0].cells[0] @@ -481,59 +503,56 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.walk_linear(cell_element._element, docx_obj, doc) return - # Initialize the table grid - table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] - - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) - + data = TableData(num_rows=num_rows, num_cols=num_cols) + cell_set: set[CT_Tc] = set() for row_idx, row in enumerate(table.rows): + _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") col_idx = 0 - for c, cell in enumerate(row.cells): - row_span = get_rowspan(cell) - col_span = get_colspan(cell) + while col_idx < num_cols: + cell: _Cell = row.cells[col_idx] + _log.debug( + f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" + ) + if cell is None or cell._tc in cell_set: + _log.debug(f" skipped since repeated content") + col_idx += cell.grid_span + continue + else: + cell_set.add(cell._tc) - cell_text = cell.text - # In case cell doesn't return text via docx library: - if len(cell_text) == 0: - cell_xml = cell._element + spanned_idx = row_idx + spanned_tc: Optional[CT_Tc] = cell._tc + while spanned_tc == cell._tc: + spanned_idx += 1 + spanned_tc = ( + table.rows[spanned_idx].cells[col_idx]._tc + if spanned_idx < num_rows + else None + ) + _log.debug(f" spanned before row {spanned_idx}") - texts = [""] - for elem in cell_xml.iter(): - if elem.tag.endswith("t"): # tags that contain text - if elem.text: - texts.append(elem.text) - # Join the collected text - cell_text = " ".join(texts).strip() - - # Find the next available column in the grid - while table_grid[row_idx][col_idx] is not None: - col_idx += 1 - - # Fill the grid with the cell value, considering rowspan and colspan - for i in range(row_span if row_span == "restart" else 1): - for j in range(col_span): - table_grid[row_idx + i][col_idx + j] = "" - - cell = TableCell( - text=cell_text, - row_span=row_span, - col_span=col_span, - start_row_offset_idx=row_idx, - end_row_offset_idx=row_idx + row_span, + table_cell = TableCell( + text=cell.text, + row_span=spanned_idx - row_idx, + col_span=cell.grid_span, + start_row_offset_idx=row.grid_cols_before + row_idx, + end_row_offset_idx=row.grid_cols_before + spanned_idx, start_col_offset_idx=col_idx, - end_col_offset_idx=col_idx + col_span, + end_col_offset_idx=col_idx + cell.grid_span, col_header=False, row_header=False, ) - - data.table_cells.append(cell) + data.table_cells.append(table_cell) + col_idx += cell.grid_span level = self.get_level() doc.add_table(data=data, parent=self.parents[level - 1]) return - def handle_pictures(self, element, docx_obj, drawing_blip, doc): - def get_docx_image(element, drawing_blip): + def handle_pictures( + self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument + ) -> None: + def get_docx_image(drawing_blip): rId = drawing_blip[0].get( "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" ) @@ -546,7 +565,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self.get_level() # Open the BytesIO object with PIL to create an Image try: - image_data = get_docx_image(element, drawing_blip) + image_data = get_docx_image(drawing_blip) image_bytes = BytesIO(image_data) pil_image = Image.open(image_bytes) doc.add_picture( diff --git a/tests/data/docx/word_tables.docx b/tests/data/docx/word_tables.docx new file mode 100644 index 0000000..1513796 Binary files /dev/null and b/tests/data/docx/word_tables.docx differ diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html new file mode 100644 index 0000000..30f6e8d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.itxt b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt new file mode 100644 index 0000000..dd42eb0 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt @@ -0,0 +1,19 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Test with tables + item-3 at level 3: paragraph: A uniform table + item-4 at level 3: table with [3x3] + item-5 at level 3: paragraph: + item-6 at level 3: paragraph: A non-uniform table with horizontal spans + item-7 at level 3: table with [3x3] + item-8 at level 3: paragraph: + item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns + item-10 at level 3: table with [3x4] + item-11 at level 3: paragraph: + item-12 at level 3: paragraph: A non-uniform table with vertical spans + item-13 at level 3: table with [5x3] + item-14 at level 3: paragraph: + item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells + item-16 at level 3: table with [9x5] + item-17 at level 3: paragraph: + item-18 at level 3: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json new file mode 100644 index 0000000..957a83c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -0,0 +1,2356 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "word_tables", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 8379738677198259833, + "filename": "word_tables.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "header-0", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test with tables", + "text": "Test with tables", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A uniform table", + "text": "A uniform table" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans", + "text": "A non-uniform table with horizontal spans" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans in inner columns", + "text": "A non-uniform table with horizontal spans in inner columns" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with vertical spans", + "text": "A non-uniform table with vertical spans" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with all kinds of spans and empty cells", + "text": "A non-uniform table with all kinds of spans and empty cells" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 9, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.md b/tests/data/groundtruth/docling_v2/word_tables.docx.md new file mode 100644 index 0000000..90123c3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.md @@ -0,0 +1,44 @@ +## Test with tables + +A uniform table + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|--------------|--------------| +| Cell 1.0 | Cell 1.1 | Cell 1.2 | +| Cell 2.0 | Cell 2.1 | Cell 2.2 | + +A non-uniform table with horizontal spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|---------------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | + +A non-uniform table with horizontal spans in inner columns + +| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 | +|--------------|---------------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 | + +A non-uniform table with vertical spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | + +A non-uniform table with all kinds of spans and empty cells + +| Header 0.0 | Header 0.1 | Header 0.2 | | | +|--------------|---------------------|--------------|----|---------------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 | +| | | | | Merged Cell 4.4 5.4 | +| | | | | | +| | | | | | +| | | | | Cell 8.4 | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9edcb3e..86bd837 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str): with open(gtfile, "r") as fr: true_text = fr.read() - assert pred_text == true_text, "pred_itxt==true_itxt" return pred_text == true_text @@ -101,3 +100,7 @@ def test_e2e_docx_conversions(): pred_json: str = json.dumps(doc.export_to_dict(), indent=2) assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export(pred_html, str(gt_path) + ".html"), "export to html" diff --git a/word_tables.html b/word_tables.html new file mode 100644 index 0000000..30f6e8d --- /dev/null +++ b/word_tables.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file