feat: Implement new reading-order model (#916)

* Implement new reading-order model, replacing DS GLM model (WIP) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update reading-order model branch Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lockfile [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add captions, footnotes and merges [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests and lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes, update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add normalization, update tests again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests with code Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Push final lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * sanitize text Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Inlcude furniture, Update tests with furniture Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix content_layer assignment Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: Delete empty file docling/models/ds_glm_model.py Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
2025-02-20 17:51:17 +01:00 · 2025-02-20 17:51:17 +01:00 · c93e36988f
commit c93e36988f
parent c031a7ae47
67 changed files with 757 additions and 871 deletions
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -1,386 +0,0 @@
-import copy
-import random
-from pathlib import Path
-from typing import List, Union
-
-from deepsearch_glm.andromeda_nlp import nlp_model
-from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItemLabel,
-    DoclingDocument,
-)
-from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
-from docling_core.types.legacy_doc.base import (
-    Figure,
-    PageDimensions,
-    PageReference,
-    Prov,
-    Ref,
-)
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
-from docling_core.types.legacy_doc.document import BaseText
-from docling_core.types.legacy_doc.document import (
-    CCSDocumentDescription as DsDocumentDescription,
-)
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from PIL import ImageDraw
-from pydantic import BaseModel, ConfigDict, TypeAdapter
-
-from docling.datamodel.base_models import (
-    Cluster,
-    ContainerElement,
-    FigureElement,
-    Table,
-    TextElement,
-)
-from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
-from docling.datamodel.settings import settings
-from docling.utils.glm_utils import to_docling_document
-from docling.utils.profiling import ProfilingScope, TimeRecorder
-from docling.utils.utils import create_hash
-
-
-class GlmOptions(BaseModel):
-    model_config = ConfigDict(protected_namespaces=())
-
-    model_names: str = ""  # e.g. "language;term;reference"
-
-
-class GlmModel:
-    def __init__(self, options: GlmOptions):
-        self.options = options
-
-        self.model = nlp_model(loglevel="error", text_ordering=True)
-
-    def _to_legacy_document(self, conv_res) -> DsDocument:
-        title = ""
-        desc: DsDocumentDescription = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
-                page=p.page_no + 1,
-                model="default",
-            )
-            for p in conv_res.pages
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=conv_res.input.file.name,
-            document_hash=conv_res.input.document_hash,
-            num_pages=conv_res.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text: List[Union[Ref, BaseText]] = []
-        page_headers: List[Union[Ref, BaseText]] = []
-        page_footers: List[Union[Ref, BaseText]] = []
-
-        tables: List[DsSchemaTable] = []
-        figures: List[Figure] = []
-
-        page_no_to_page = {p.page_no: p for p in conv_res.pages}
-
-        for element in conv_res.assembled.body:
-            # Convert bboxes to lower-left origin.
-            target_bbox = DsBoundingBox(
-                element.cluster.bbox.to_bottom_left_origin(
-                    page_no_to_page[element.page_no].size.height
-                ).as_tuple()
-            )
-
-            if isinstance(element, TextElement):
-                main_text.append(
-                    BaseText(
-                        text=element.text,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        name=element.label,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, len(element.text)],
-                            )
-                        ],
-                    )
-                )
-            elif isinstance(element, Table):
-                index = len(tables)
-                ref_str = f"#/tables/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
-
-                # Initialise empty table data grid (only empty cells)
-                table_data = [
-                    [
-                        TableCell(
-                            text="",
-                            # bbox=[0,0,0,0],
-                            spans=[[i, j]],
-                            obj_type="body",
-                        )
-                        for j in range(element.num_cols)
-                    ]
-                    for i in range(element.num_rows)
-                ]
-
-                # Overwrite cells in table data for which there is actual cell content.
-                for cell in element.table_cells:
-                    for i in range(
-                        min(cell.start_row_offset_idx, element.num_rows),
-                        min(cell.end_row_offset_idx, element.num_rows),
-                    ):
-                        for j in range(
-                            min(cell.start_col_offset_idx, element.num_cols),
-                            min(cell.end_col_offset_idx, element.num_cols),
-                        ):
-                            celltype = "body"
-                            if cell.column_header:
-                                celltype = "col_header"
-                            elif cell.row_header:
-                                celltype = "row_header"
-                            elif cell.row_section:
-                                celltype = "row_section"
-
-                            def make_spans(cell):
-                                for rspan in range(
-                                    min(cell.start_row_offset_idx, element.num_rows),
-                                    min(cell.end_row_offset_idx, element.num_rows),
-                                ):
-                                    for cspan in range(
-                                        min(
-                                            cell.start_col_offset_idx, element.num_cols
-                                        ),
-                                        min(cell.end_col_offset_idx, element.num_cols),
-                                    ):
-                                        yield [rspan, cspan]
-
-                            spans = list(make_spans(cell))
-                            if cell.bbox is not None:
-                                bbox = cell.bbox.to_bottom_left_origin(
-                                    page_no_to_page[element.page_no].size.height
-                                ).as_tuple()
-                            else:
-                                bbox = None
-
-                            table_data[i][j] = TableCell(
-                                text=cell.text,
-                                bbox=bbox,
-                                # col=j,
-                                # row=i,
-                                spans=spans,
-                                obj_type=celltype,
-                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
-                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
-                            )
-
-                tables.append(
-                    DsSchemaTable(
-                        num_cols=element.num_cols,
-                        num_rows=element.num_rows,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        data=table_data,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
-                    )
-                )
-
-            elif isinstance(element, FigureElement):
-                index = len(figures)
-                ref_str = f"#/figures/{index}"
-                main_text.append(
-                    Ref(
-                        name=element.label,
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        ref=ref_str,
-                    ),
-                )
-                figures.append(
-                    Figure(
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        payload={
-                            "children": TypeAdapter(List[Cluster]).dump_python(
-                                element.cluster.children
-                            )
-                        },  # hack to channel child clusters through GLM
-                    )
-                )
-            elif isinstance(element, ContainerElement):
-                main_text.append(
-                    BaseText(
-                        text="",
-                        payload={
-                            "children": TypeAdapter(List[Cluster]).dump_python(
-                                element.cluster.children
-                            )
-                        },  # hack to channel child clusters through GLM
-                        obj_type=layout_label_to_ds_type.get(element.label),
-                        name=element.label,
-                        prov=[
-                            Prov(
-                                bbox=target_bbox,
-                                page=element.page_no + 1,
-                                span=[0, 0],
-                            )
-                        ],
-                    )
-                )
-
-        # We can throw in headers and footers at the end of the legacy doc
-        # since the reading-order will re-sort it later.
-        for element in conv_res.assembled.headers:
-            # Convert bboxes to lower-left origin.
-            target_bbox = DsBoundingBox(
-                element.cluster.bbox.to_bottom_left_origin(
-                    page_no_to_page[element.page_no].size.height
-                ).as_tuple()
-            )
-
-            if isinstance(element, TextElement):
-
-                tel = BaseText(
-                    text=element.text,
-                    obj_type=layout_label_to_ds_type.get(element.label),
-                    name=element.label,
-                    prov=[
-                        Prov(
-                            bbox=target_bbox,
-                            page=element.page_no + 1,
-                            span=[0, len(element.text)],
-                        )
-                    ],
-                )
-                if element.label == DocItemLabel.PAGE_HEADER:
-                    index = len(page_headers)
-                    ref_str = f"#/page-headers/{index}"
-                    main_text.append(
-                        Ref(
-                            name=element.label,
-                            obj_type=layout_label_to_ds_type.get(element.label),
-                            ref=ref_str,
-                        ),
-                    )
-                    page_headers.append(tel)
-                elif element.label == DocItemLabel.PAGE_FOOTER:
-                    index = len(page_footers)
-                    ref_str = f"#/page-footers/{index}"
-                    main_text.append(
-                        Ref(
-                            name=element.label,
-                            obj_type=layout_label_to_ds_type.get(element.label),
-                            ref=ref_str,
-                        ),
-                    )
-                    page_footers.append(tel)
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
-            for p in conv_res.pages
-            if p.size is not None
-        ]
-
-        ds_doc: DsDocument = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-            page_headers=page_headers,
-            page_footers=page_footers,
-        )
-
-        return ds_doc
-
-    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
-        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
-            ds_doc = self._to_legacy_document(conv_res)
-            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
-
-            glm_doc = self.model.apply_on_doc(ds_doc_dict)
-
-            docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
-            1 == 1
-
-        # DEBUG code:
-        def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
-            clusters_to_draw = []
-            image = copy.deepcopy(conv_res.pages[page_no].image)
-            for ix, elem in enumerate(ds_document.main_text):
-                if isinstance(elem, BaseText):
-                    prov = elem.prov[0]  # type: ignore
-                elif isinstance(elem, Ref):
-                    _, arr, index = elem.ref.split("/")
-                    index = int(index)  # type: ignore
-                    if arr == "tables":
-                        prov = ds_document.tables[index].prov[0]
-                    elif arr == "figures":
-                        prov = ds_document.pictures[index].prov[0]
-                    else:
-                        prov = None
-
-                if prov and prov.page == page_no:
-                    clusters_to_draw.append(
-                        Cluster(
-                            id=ix,
-                            label=elem.name,
-                            bbox=BoundingBox.from_tuple(
-                                coord=prov.bbox,  # type: ignore
-                                origin=CoordOrigin.BOTTOMLEFT,
-                            ).to_top_left_origin(conv_res.pages[page_no].size.height),
-                        )
-                    )
-
-            draw = ImageDraw.Draw(image)
-            for c in clusters_to_draw:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-                draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
-
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                for tc in c.cells:  # [:1]:
-                    x0, y0, x1, y1 = tc.bbox.as_tuple()
-                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-
-            if show:
-                image.show()
-            else:
-                out_path: Path = (
-                    Path(settings.debug.debug_output_path)
-                    / f"debug_{conv_res.input.file.stem}"
-                )
-                out_path.mkdir(parents=True, exist_ok=True)
-
-                out_file = out_path / f"doc_page_{page_no:05}.png"
-                image.save(str(out_file), format="png")
-
-        # for item in ds_doc.page_dimensions:
-        #    page_no = item.page
-        #    draw_clusters_and_cells(ds_doc, page_no)
-
-        return docling_doc
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):

        sanitized_text = "".join(lines)

+        # Text normalization
+        sanitized_text = sanitized_text.replace("⁄", "/")
+        sanitized_text = sanitized_text.replace("’", "'")
+        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("“", '"')
+        sanitized_text = sanitized_text.replace("”", '"')
+        sanitized_text = sanitized_text.replace("•", "·")
+
        return sanitized_text.strip()  # Strip any leading or trailing whitespace

    def __call__(
--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@ -0,0 +1,389 @@
+import copy
+import random
+from pathlib import Path
+from typing import Dict, List
+
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    NodeItem,
+    ProvenanceItem,
+    RefItem,
+    TableData,
+)
+from docling_core.types.doc.document import ContentLayer
+from docling_core.types.legacy_doc.base import Ref
+from docling_core.types.legacy_doc.document import BaseText
+from docling_ibm_models.reading_order.reading_order_rb import (
+    PageElement as ReadingOrderPageElement,
+)
+from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
+from PIL import ImageDraw
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.base_models import (
+    BasePageElement,
+    Cluster,
+    ContainerElement,
+    FigureElement,
+    Table,
+    TextElement,
+)
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.settings import settings
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+
+class ReadingOrderOptions(BaseModel):
+    model_config = ConfigDict(protected_namespaces=())
+
+    model_names: str = ""  # e.g. "language;term;reference"
+
+
+class ReadingOrderModel:
+    def __init__(self, options: ReadingOrderOptions):
+        self.options = options
+        self.ro_model = ReadingOrderPredictor()
+
+    def _assembled_to_readingorder_elements(
+        self, conv_res: ConversionResult
+    ) -> List[ReadingOrderPageElement]:
+
+        elements: List[ReadingOrderPageElement] = []
+        page_no_to_pages = {p.page_no: p for p in conv_res.pages}
+
+        for element in conv_res.assembled.elements:
+
+            page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
+            bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
+            text = element.text or ""
+
+            elements.append(
+                ReadingOrderPageElement(
+                    cid=len(elements),
+                    ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
+                    text=text,
+                    page_no=element.page_no,
+                    page_size=page_no_to_pages[element.page_no].size,
+                    label=element.label,
+                    l=bbox.l,
+                    r=bbox.r,
+                    b=bbox.b,
+                    t=bbox.t,
+                    coord_origin=bbox.coord_origin,
+                )
+            )
+
+        return elements
+
+    def _add_child_elements(
+        self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
+    ):
+
+        child: Cluster
+        for child in element.cluster.children:
+            c_label = child.label
+            c_bbox = child.bbox.to_bottom_left_origin(
+                doc.pages[element.page_no + 1].size.height
+            )
+            c_text = " ".join(
+                [
+                    cell.text.replace("\x02", "-").strip()
+                    for cell in child.cells
+                    if len(cell.text.strip()) > 0
+                ]
+            )
+
+            c_prov = ProvenanceItem(
+                page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
+            )
+            if c_label == DocItemLabel.LIST_ITEM:
+                # TODO: Infer if this is a numbered or a bullet list item
+                doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
+            elif c_label == DocItemLabel.SECTION_HEADER:
+                doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
+            else:
+                doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
+
+    def _readingorder_elements_to_docling_doc(
+        self,
+        conv_res: ConversionResult,
+        ro_elements: List[ReadingOrderPageElement],
+        el_to_captions_mapping: Dict[int, List[int]],
+        el_to_footnotes_mapping: Dict[int, List[int]],
+        el_merges_mapping: Dict[int, List[int]],
+    ) -> DoclingDocument:
+
+        id_to_elem = {
+            RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
+            for elem in conv_res.assembled.elements
+        }
+        cid_to_rels = {rel.cid: rel for rel in ro_elements}
+
+        origin = DocumentOrigin(
+            mimetype="application/pdf",
+            filename=conv_res.input.file.name,
+            binary_hash=conv_res.input.document_hash,
+        )
+        doc_name = Path(origin.filename).stem
+        out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+
+        for page in conv_res.pages:
+            page_no = page.page_no + 1
+            size = page.size
+
+            assert size is not None
+
+            out_doc.add_page(page_no=page_no, size=size)
+
+        current_list = None
+        skippable_cids = {
+            cid
+            for mapping in (
+                el_to_captions_mapping,
+                el_to_footnotes_mapping,
+                el_merges_mapping,
+            )
+            for lst in mapping.values()
+            for cid in lst
+        }
+
+        page_no_to_pages = {p.page_no: p for p in conv_res.pages}
+
+        for rel in ro_elements:
+            if rel.cid in skippable_cids:
+                continue
+            element = id_to_elem[rel.ref.cref]
+
+            page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
+
+            if isinstance(element, TextElement):
+                if element.label == DocItemLabel.CODE:
+                    cap_text = element.text
+                    prov = ProvenanceItem(
+                        page_no=element.page_no + 1,
+                        charspan=(0, len(cap_text)),
+                        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+                    )
+                    code_item = out_doc.add_code(text=cap_text, prov=prov)
+
+                    if rel.cid in el_to_captions_mapping.keys():
+                        for caption_cid in el_to_captions_mapping[rel.cid]:
+                            caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
+                            new_cap_item = self._add_caption_or_footnote(
+                                caption_elem, out_doc, code_item, page_height
+                            )
+
+                            code_item.captions.append(new_cap_item.get_ref())
+
+                    if rel.cid in el_to_footnotes_mapping.keys():
+                        for footnote_cid in el_to_footnotes_mapping[rel.cid]:
+                            footnote_elem = id_to_elem[
+                                cid_to_rels[footnote_cid].ref.cref
+                            ]
+                            new_footnote_item = self._add_caption_or_footnote(
+                                footnote_elem, out_doc, code_item, page_height
+                            )
+
+                            code_item.footnotes.append(new_footnote_item.get_ref())
+                else:
+
+                    new_item, current_list = self._handle_text_element(
+                        element, out_doc, current_list, page_height
+                    )
+
+                    if rel.cid in el_merges_mapping.keys():
+                        for merged_cid in el_merges_mapping[rel.cid]:
+                            merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
+
+                            self._merge_elements(
+                                element, merged_elem, new_item, page_height
+                            )
+
+            elif isinstance(element, Table):
+
+                tbl_data = TableData(
+                    num_rows=element.num_rows,
+                    num_cols=element.num_cols,
+                    table_cells=element.table_cells,
+                )
+
+                prov = ProvenanceItem(
+                    page_no=element.page_no + 1,
+                    charspan=(0, 0),
+                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+                )
+
+                tbl = out_doc.add_table(
+                    data=tbl_data, prov=prov, label=element.cluster.label
+                )
+
+                if rel.cid in el_to_captions_mapping.keys():
+                    for caption_cid in el_to_captions_mapping[rel.cid]:
+                        caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
+                        new_cap_item = self._add_caption_or_footnote(
+                            caption_elem, out_doc, tbl, page_height
+                        )
+
+                        tbl.captions.append(new_cap_item.get_ref())
+
+                if rel.cid in el_to_footnotes_mapping.keys():
+                    for footnote_cid in el_to_footnotes_mapping[rel.cid]:
+                        footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
+                        new_footnote_item = self._add_caption_or_footnote(
+                            footnote_elem, out_doc, tbl, page_height
+                        )
+
+                        tbl.footnotes.append(new_footnote_item.get_ref())
+
+                # TODO: Consider adding children of Table.
+
+            elif isinstance(element, FigureElement):
+                cap_text = ""
+                prov = ProvenanceItem(
+                    page_no=element.page_no + 1,
+                    charspan=(0, len(cap_text)),
+                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+                )
+                pic = out_doc.add_picture(prov=prov)
+
+                if rel.cid in el_to_captions_mapping.keys():
+                    for caption_cid in el_to_captions_mapping[rel.cid]:
+                        caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
+                        new_cap_item = self._add_caption_or_footnote(
+                            caption_elem, out_doc, pic, page_height
+                        )
+
+                        pic.captions.append(new_cap_item.get_ref())
+
+                if rel.cid in el_to_footnotes_mapping.keys():
+                    for footnote_cid in el_to_footnotes_mapping[rel.cid]:
+                        footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
+                        new_footnote_item = self._add_caption_or_footnote(
+                            footnote_elem, out_doc, pic, page_height
+                        )
+
+                        pic.footnotes.append(new_footnote_item.get_ref())
+
+                self._add_child_elements(element, pic, out_doc)
+
+            elif isinstance(element, ContainerElement):  # Form, KV region
+                label = element.label
+                group_label = GroupLabel.UNSPECIFIED
+                if label == DocItemLabel.FORM:
+                    group_label = GroupLabel.FORM_AREA
+                elif label == DocItemLabel.KEY_VALUE_REGION:
+                    group_label = GroupLabel.KEY_VALUE_AREA
+
+                container_el = out_doc.add_group(label=group_label)
+
+                self._add_child_elements(element, container_el, out_doc)
+
+        return out_doc
+
+    def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
+        assert isinstance(elem, TextElement)
+        text = elem.text
+        prov = ProvenanceItem(
+            page_no=elem.page_no + 1,
+            charspan=(0, len(text)),
+            bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        new_item = out_doc.add_text(
+            label=elem.label, text=text, prov=prov, parent=parent
+        )
+        return new_item
+
+    def _handle_text_element(self, element, out_doc, current_list, page_height):
+        cap_text = element.text
+
+        prov = ProvenanceItem(
+            page_no=element.page_no + 1,
+            charspan=(0, len(cap_text)),
+            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        label = element.label
+        if label == DocItemLabel.LIST_ITEM:
+            if current_list is None:
+                current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
+
+            # TODO: Infer if this is a numbered or a bullet list item
+            new_item = out_doc.add_list_item(
+                text=cap_text, enumerated=False, prov=prov, parent=current_list
+            )
+        elif label == DocItemLabel.SECTION_HEADER:
+            current_list = None
+
+            new_item = out_doc.add_heading(text=cap_text, prov=prov)
+        elif label == DocItemLabel.FORMULA:
+            current_list = None
+
+            new_item = out_doc.add_text(
+                label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
+            )
+        else:
+            current_list = None
+
+            content_layer = ContentLayer.BODY
+            if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+                content_layer = ContentLayer.FURNITURE
+
+            new_item = out_doc.add_text(
+                label=element.label,
+                text=cap_text,
+                prov=prov,
+                content_layer=content_layer,
+            )
+        return new_item, current_list
+
+    def _merge_elements(self, element, merged_elem, new_item, page_height):
+        assert isinstance(
+            merged_elem, type(element)
+        ), "Merged element must be of same type as element."
+        assert (
+            merged_elem.label == new_item.label
+        ), "Labels of merged elements must match."
+        prov = ProvenanceItem(
+            page_no=element.page_no + 1,
+            charspan=(
+                len(new_item.text) + 1,
+                len(new_item.text) + 1 + len(merged_elem.text),
+            ),
+            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        new_item.text += f" {merged_elem.text}"
+        new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.
+        new_item.prov.append(prov)
+
+    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
+        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
+            page_elements = self._assembled_to_readingorder_elements(conv_res)
+
+            # Apply reading order
+            sorted_elements = self.ro_model.predict_reading_order(
+                page_elements=page_elements
+            )
+            el_to_captions_mapping = self.ro_model.predict_to_captions(
+                sorted_elements=sorted_elements
+            )
+            el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
+                sorted_elements=sorted_elements
+            )
+            el_merges_mapping = self.ro_model.predict_merges(
+                sorted_elements=sorted_elements
+            )
+
+            docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
+                conv_res,
+                sorted_elements,
+                el_to_captions_mapping,
+                el_to_footnotes_mapping,
+                el_merges_mapping,
+            )
+
+        return docling_doc
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
    DocumentPictureClassifier,
    DocumentPictureClassifierOptions,
 )
-from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.ocr_mac_model import OcrMacModel
@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
 from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
+from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            or self.pipeline_options.generate_table_images
        )

-        self.glm_model = GlmModel(options=GlmOptions())
+        self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())

        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
            raise RuntimeError(
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@ -5,16 +5,18 @@ from pathlib import Path
 from typing import Iterable

 import yaml
+from docling_core.types.doc import ImageRefMode

-from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

 USE_V2 = True
-USE_LEGACY = True
+USE_LEGACY = False


 def export_documents(
@ -33,26 +35,31 @@ def export_documents(
            doc_filename = conv_res.input.file.stem

            if USE_V2:
-                # Export Docling document format to JSON:
-                with (output_dir / f"{doc_filename}.json").open("w") as fp:
-                    fp.write(json.dumps(conv_res.document.export_to_dict()))
+                conv_res.document.save_as_json(
+                    output_dir / f"{doc_filename}.json",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
+                conv_res.document.save_as_html(
+                    output_dir / f"{doc_filename}.html",
+                    image_mode=ImageRefMode.EMBEDDED,
+                )
+                conv_res.document.save_as_document_tokens(
+                    output_dir / f"{doc_filename}.doctags.txt"
+                )
+                conv_res.document.save_as_markdown(
+                    output_dir / f"{doc_filename}.md",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
+                conv_res.document.save_as_markdown(
+                    output_dir / f"{doc_filename}.txt",
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                    strict_text=True,
+                )

                # Export Docling document format to YAML:
                with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
                    fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))

-                # Export Docling document format to doctags:
-                with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
-                    fp.write(conv_res.document.export_to_document_tokens())
-
-                # Export Docling document format to markdown:
-                with (output_dir / f"{doc_filename}.md").open("w") as fp:
-                    fp.write(conv_res.document.export_to_markdown())
-
-                # Export Docling document format to text:
-                with (output_dir / f"{doc_filename}.txt").open("w") as fp:
-                    fp.write(conv_res.document.export_to_markdown(strict_text=True))
-
            if USE_LEGACY:
                # Export Deep Search document JSON format:
                with (output_dir / f"{doc_filename}.legacy.json").open(
@ -119,13 +126,20 @@ def main():
    # settings.debug.visualize_tables = True
    # settings.debug.visualize_cells = True

-    doc_converter = DocumentConverter()
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.generate_page_images = True
+
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+        }
+    )

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
-        raises_on_error=False,  # to let conversion run through all and examine results at the end
+        raises_on_error=True,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("scratch")
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@ -187,8 +187,8 @@ files = [
 lazy-object-proxy = ">=1.4.0"
 typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
 wrapt = [
-    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
    {version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
+    {version = ">=1.11,<2", markers = "python_version < \"3.11\""},
 ]

 [[package]]
@ -782,52 +782,6 @@ files = [
    {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
 ]

-[[package]]
-name = "deepsearch-glm"
-version = "1.0.0"
-description = "Graph Language Models"
-optional = false
-python-versions = "<4.0,>=3.9"
-files = [
-    {file = "deepsearch_glm-1.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94792b57df7a1c4ba8b47ebd8f36ea0a090d4f27a4fba39bd7b166b6b537260a"},
-    {file = "deepsearch_glm-1.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ff46e352e96a2f56ce7ae4fdf04b271ee841c29ff159b1dec0e5ecaaadba8d4d"},
-    {file = "deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d77d3d94d49641888aa15f3ad23e81158e791aa9d9608dd8168dc71788e56f3"},
-    {file = "deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:143de0fd111a570be12935d8799a2715fe1775d4dc4e256337860b429cee5d36"},
-    {file = "deepsearch_glm-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9f2872dd573cd2206ce7f9e2e6016c38b66d9ecbd983283ff5e8c6023813c311"},
-    {file = "deepsearch_glm-1.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:e64d94ff5209f0a11e8c75c6b28b033ef27b95a22c2fbcbd945e7fe8cc421545"},
-    {file = "deepsearch_glm-1.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a5702205677b768b51f881d15d933370f6ef3c826dfac3b9aa0b904d2e6c495a"},
-    {file = "deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0417a2ae998e1709f03458cfb9adb55423bb1328224eb055300796baa757879f"},
-    {file = "deepsearch_glm-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0e1efe9af0d28e9b473fe599246deb3a0be7c3d546a478da284747144d086a"},
-    {file = "deepsearch_glm-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:807faf13eb0deea55a1951d479a85d5e20de0ff8b2e0b57b2f7939552759a426"},
-    {file = "deepsearch_glm-1.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:56d9575df9eceb8c2ae33e3d15e133924cc195714c3d268599b6f8414c1f6bb8"},
-    {file = "deepsearch_glm-1.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:51f5c6522f60ba73eb12eeb7217bd98d871ba7c078337a4059d05878d8baf2d6"},
-    {file = "deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6211eaf497ad7cfcb68f80f9b5387940be0204fe149a9fc03988a95145f410a"},
-    {file = "deepsearch_glm-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b003bf457fce61ea4de79e2d7d0228a1ae349f677eb6570e745f79d4429804f"},
-    {file = "deepsearch_glm-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9d61f66048e6ab60fe9f84c823fd593bf8517755833bd9efb59156d77a2b42d0"},
-    {file = "deepsearch_glm-1.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:7d558e8b365c27ee665d0589165fd074fb252c73715f9cc6aeb4304a63683f37"},
-    {file = "deepsearch_glm-1.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:3199093a9472e5756214b9b6563f827c19c001c7dd8ae00e03eed1140c12930d"},
-    {file = "deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f18d1ee68a0479592e0c714e6cbf9e2d0fa8edd692d580da64431c84cbef5c2"},
-    {file = "deepsearch_glm-1.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c1c0ea0a544219da15c017632f9e0be116ecdc335b865c6c5760429557fe23"},
-    {file = "deepsearch_glm-1.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:962f393dcec2204de1a5cb0f635c65258bde2424ad2d4e0f5df770139c3958de"},
-    {file = "deepsearch_glm-1.0.0-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4d328336950975c583d318a70e3511075d1ac1c599c2090a2a7928a4662fe8f2"},
-    {file = "deepsearch_glm-1.0.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:748d077a4cacd714ff23a095c873549c176fa5ffe1a656be1bd11873148e58db"},
-    {file = "deepsearch_glm-1.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c0953d1983e902327f0cc152ff8267056ec2699106eefc70a41eec6eebdbe1b"},
-    {file = "deepsearch_glm-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:105c50b2e5b8f9a6ea5fb0b755a9cd38a1fb12ecb07f1a13d1290ad3cdfeaa90"},
-    {file = "deepsearch_glm-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:25bb899317f6af062083daa578f343c93a2b12755c174549fb58596de0bc7b9d"},
-    {file = "deepsearch_glm-1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2315cc4ffe7032dada294a0cd72a47dbc6c0121fd07d4b5719f9a9e9519d091"},
-    {file = "deepsearch_glm-1.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:707b92f51bacbd0f799ee3351474766bf916ef82f97c1bcc0e7696532ba03535"},
-    {file = "deepsearch_glm-1.0.0.tar.gz", hash = "sha256:e8dce88ac519a693c260f28bd3c4ec409811e65ade84fb508f6c6e37ca065e62"},
-]
-
-[package.dependencies]
-pywin32 = {version = ">=307,<308", markers = "sys_platform == \"win32\""}
-
-[package.extras]
-docling = ["docling-core (>=2.0,<3.0)", "pandas (>=1.5.1,<3.0.0)"]
-pyplot = ["matplotlib (>=3.7.1,<4.0.0)"]
-toolkit = ["deepsearch-toolkit (>=1.1.0,<2.0.0)", "python-dotenv (>=1.0.0,<2.0.0)"]
-utils = ["pandas (>=1.5.1,<3.0.0)", "python-dotenv (>=1.0.0,<2.0.0)", "requests (>=2.32.3,<3.0.0)", "rich (>=13.7.0,<14.0.0)", "tabulate (>=0.8.9)", "tqdm (>=4.64.0,<5.0.0)"]
-
 [[package]]
 name = "defusedxml"
 version = "0.7.1"
@ -895,16 +849,17 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]

 [[package]]
 name = "docling-ibm-models"
-version = "3.3.2"
+version = "3.4.0"
 description = "This package contains the AI models used by the Docling PDF conversion package"
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_ibm_models-3.3.2-py3-none-any.whl", hash = "sha256:9f82a2ef73c6cd8d729ab2fcc4223079ccb8b6eec0bf0643c56e55352b97b5cb"},
-    {file = "docling_ibm_models-3.3.2.tar.gz", hash = "sha256:f6ed59dfb3f98a71ccdd003c13c9a868e3003c22bd5adc554197da7eec227cde"},
+    {file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"},
+    {file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"},
 ]

 [package.dependencies]
+docling-core = ">=2.19.0,<3.0.0"
 huggingface_hub = ">=0.23,<1"
 jsonlines = ">=3.1.0,<4.0.0"
 numpy = [
@ -913,6 +868,7 @@ numpy = [
 ]
 opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
 Pillow = ">=10.0.0,<12.0.0"
+pydantic = ">=2.0.0,<3.0.0"
 safetensors = {version = ">=0.4.3,<1", extras = ["torch"]}
 torch = ">=2.2.2,<3.0.0"
 torchvision = ">=0,<1"
@ -1495,13 +1451,13 @@ zstd = ["zstandard (>=0.18.0)"]

 [[package]]
 name = "huggingface-hub"
-version = "0.29.0"
+version = "0.29.1"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.29.0-py3-none-any.whl", hash = "sha256:c02daa0b6bafbdacb1320fdfd1dc7151d0940825c88c4ef89837fdb1f6ea0afe"},
-    {file = "huggingface_hub-0.29.0.tar.gz", hash = "sha256:64034c852be270cac16c5743fe1f659b14515a9de6342d6f42cbb2ede191fc80"},
+    {file = "huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5"},
+    {file = "huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250"},
 ]

 [package.dependencies]
@ -2728,13 +2684,13 @@ pygments = ">2.12.0"

 [[package]]
 name = "mkdocs-material"
-version = "9.6.4"
+version = "9.6.5"
 description = "Documentation that simply works"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "mkdocs_material-9.6.4-py3-none-any.whl", hash = "sha256:414e8376551def6d644b8e6f77226022868532a792eb2c9accf52199009f568f"},
-    {file = "mkdocs_material-9.6.4.tar.gz", hash = "sha256:4d1d35e1c1d3e15294cb7fa5d02e0abaee70d408f75027dc7be6e30fb32e6867"},
+    {file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"},
+    {file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"},
 ]

 [package.dependencies]
@ -2835,8 +2791,8 @@ files = [

 [package.dependencies]
 multiprocess = [
-    {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
    {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
+    {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
 ]
 pygments = ">=2.0"
 pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
@ -3845,10 +3801,10 @@ files = [

 [package.dependencies]
 numpy = [
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
 ]
@ -3871,10 +3827,10 @@ files = [

 [package.dependencies]
 numpy = [
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
-    {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
-    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
    {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
 ]
@ -4060,9 +4016,9 @@ files = [

 [package.dependencies]
 numpy = [
-    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
-    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
@ -4249,13 +4205,13 @@ xmp = ["defusedxml"]

 [[package]]
 name = "pkginfo"
-version = "1.12.1.1"
+version = "1.12.1.2"
 description = "Query metadata from sdists / bdists / installed packages."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pkginfo-1.12.1.1-py3-none-any.whl", hash = "sha256:57f0b56061c84d5875556fb9437ef80b69863dea56dbad30b0b9a9887652b47c"},
-    {file = "pkginfo-1.12.1.1.tar.gz", hash = "sha256:4a09d6ab00af353d24021a42c1b00ac9d0beef0ccc012a5c122430acbeb01c0e"},
+    {file = "pkginfo-1.12.1.2-py3-none-any.whl", hash = "sha256:c783ac885519cab2c34927ccfa6bf64b5a704d7c69afaea583dd9b7afe969343"},
+    {file = "pkginfo-1.12.1.2.tar.gz", hash = "sha256:5cd957824ac36f140260964eba3c6be6442a8359b8c48f4adf90210f33a04b7b"},
 ]

 [package.extras]
@ -4810,8 +4766,8 @@ files = [
 astroid = ">=2.15.8,<=2.17.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = [
-    {version = ">=0.2", markers = "python_version < \"3.11\""},
    {version = ">=0.3.6", markers = "python_version >= \"3.11\""},
+    {version = ">=0.2", markers = "python_version < \"3.11\""},
 ]
 isort = ">=4.2.5,<6"
 mccabe = ">=0.6,<0.8"
@ -5301,29 +5257,29 @@ files = [

 [[package]]
 name = "pywin32"
-version = "307"
+version = "308"
 description = "Python for Window Extensions"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pywin32-307-cp310-cp310-win32.whl", hash = "sha256:f8f25d893c1e1ce2d685ef6d0a481e87c6f510d0f3f117932781f412e0eba31b"},
-    {file = "pywin32-307-cp310-cp310-win_amd64.whl", hash = "sha256:36e650c5e5e6b29b5d317385b02d20803ddbac5d1031e1f88d20d76676dd103d"},
-    {file = "pywin32-307-cp310-cp310-win_arm64.whl", hash = "sha256:0c12d61e0274e0c62acee79e3e503c312426ddd0e8d4899c626cddc1cafe0ff4"},
-    {file = "pywin32-307-cp311-cp311-win32.whl", hash = "sha256:fec5d27cc893178fab299de911b8e4d12c5954e1baf83e8a664311e56a272b75"},
-    {file = "pywin32-307-cp311-cp311-win_amd64.whl", hash = "sha256:987a86971753ed7fdd52a7fb5747aba955b2c7fbbc3d8b76ec850358c1cc28c3"},
-    {file = "pywin32-307-cp311-cp311-win_arm64.whl", hash = "sha256:fd436897c186a2e693cd0437386ed79f989f4d13d6f353f8787ecbb0ae719398"},
-    {file = "pywin32-307-cp312-cp312-win32.whl", hash = "sha256:07649ec6b01712f36debf39fc94f3d696a46579e852f60157a729ac039df0815"},
-    {file = "pywin32-307-cp312-cp312-win_amd64.whl", hash = "sha256:00d047992bb5dcf79f8b9b7c81f72e0130f9fe4b22df613f755ab1cc021d8347"},
-    {file = "pywin32-307-cp312-cp312-win_arm64.whl", hash = "sha256:b53658acbfc6a8241d72cc09e9d1d666be4e6c99376bc59e26cdb6223c4554d2"},
-    {file = "pywin32-307-cp313-cp313-win32.whl", hash = "sha256:ea4d56e48dc1ab2aa0a5e3c0741ad6e926529510516db7a3b6981a1ae74405e5"},
-    {file = "pywin32-307-cp313-cp313-win_amd64.whl", hash = "sha256:576d09813eaf4c8168d0bfd66fb7cb3b15a61041cf41598c2db4a4583bf832d2"},
-    {file = "pywin32-307-cp313-cp313-win_arm64.whl", hash = "sha256:b30c9bdbffda6a260beb2919f918daced23d32c79109412c2085cbc513338a0a"},
-    {file = "pywin32-307-cp37-cp37m-win32.whl", hash = "sha256:5101472f5180c647d4525a0ed289ec723a26231550dbfd369ec19d5faf60e511"},
-    {file = "pywin32-307-cp37-cp37m-win_amd64.whl", hash = "sha256:05de55a7c110478dc4b202230e98af5e0720855360d2b31a44bb4e296d795fba"},
-    {file = "pywin32-307-cp38-cp38-win32.whl", hash = "sha256:13d059fb7f10792542082f5731d5d3d9645320fc38814759313e5ee97c3fac01"},
-    {file = "pywin32-307-cp38-cp38-win_amd64.whl", hash = "sha256:7e0b2f93769d450a98ac7a31a087e07b126b6d571e8b4386a5762eb85325270b"},
-    {file = "pywin32-307-cp39-cp39-win32.whl", hash = "sha256:55ee87f2f8c294e72ad9d4261ca423022310a6e79fb314a8ca76ab3f493854c6"},
-    {file = "pywin32-307-cp39-cp39-win_amd64.whl", hash = "sha256:e9d5202922e74985b037c9ef46778335c102b74b95cec70f629453dbe7235d87"},
+    {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"},
+    {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"},
+    {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"},
+    {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"},
+    {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"},
+    {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"},
+    {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"},
+    {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"},
+    {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"},
+    {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"},
+    {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"},
+    {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"},
+    {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"},
+    {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"},
+    {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"},
+    {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"},
+    {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"},
+    {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"},
 ]

 [[package]]
@ -7830,4 +7786,4 @@ vlm = ["transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "63f9271160d39cac74fa3fc959dbb0f91530d76a693c69d81ced006477d04315"
+content-hash = "21c25b86d88aa138f7faa68fcba95af1d9a5edaa22550bd325997aed6eef44fe"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -27,8 +27,7 @@ packages = [{include = "docling"}]
 python = "^3.9"
 pydantic = "^2.0.0"
 docling-core = {extras = ["chunking"], version = "^2.19.0"}
-docling-ibm-models = "^3.3.0"
-deepsearch-glm = "^1.0.0"
+docling-ibm-models = "^3.4.0"
 docling-parse = "^3.3.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
@ -164,7 +163,6 @@ module = [
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
-    "deepsearch_glm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
@ -4,29 +4,27 @@
 <paragraph><location><page_1><loc_34><loc_77><loc_62><loc_78></location>{ ahn,nli,mly,taa } @zurich.ibm.com</paragraph>
 <subtitle-level-1><location><page_1><loc_24><loc_71><loc_31><loc_73></location>Abstract</subtitle-level-1>
 <subtitle-level-1><location><page_1><loc_52><loc_71><loc_67><loc_72></location>a. Picture of a table:</subtitle-level-1>
+<paragraph><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</paragraph>
 <subtitle-level-1><location><page_1><loc_8><loc_30><loc_21><loc_32></location>1. Introduction</subtitle-level-1>
 <paragraph><location><page_1><loc_8><loc_10><loc_47><loc_29></location>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</paragraph>
 <figure>
 <location><page_1><loc_52><loc_62><loc_88><loc_71></location>
 </figure>
-<caption><location><page_1><loc_8><loc_35><loc_47><loc_70></location>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
 <table>
 <location><page_1><loc_52><loc_62><loc_88><loc_71></location>
-<caption>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption>
 <row_0><col_0><col_header>3</col_0><col_1><col_header>1</col_1></row_0>
 </table>
 <paragraph><location><page_1><loc_52><loc_58><loc_79><loc_60></location>- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</paragraph>
+<paragraph><location><page_1><loc_52><loc_46><loc_80><loc_47></location>- c. Structure predicted by TableFormer:</paragraph>
 <figure>
 <location><page_1><loc_51><loc_48><loc_88><loc_57></location>
 </figure>
-<paragraph><location><page_1><loc_52><loc_46><loc_80><loc_47></location>- c. Structure predicted by TableFormer:</paragraph>
 <figure>
 <location><page_1><loc_52><loc_37><loc_88><loc_45></location>
+<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
 </figure>
-<caption><location><page_1><loc_50><loc_29><loc_89><loc_35></location>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
 <table>
 <location><page_1><loc_52><loc_37><loc_88><loc_45></location>
-<caption>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption>
 <row_0><col_0><col_header>0</col_0><col_1><col_header>1</col_1><col_2><col_header>1</col_2><col_3><col_header>2 1</col_3><col_4><col_header>2 1</col_4><col_5><body></col_5></row_0>
 <row_1><col_0><body>3</col_0><col_1><body>4</col_1><col_2><body>5 3</col_2><col_3><body>6</col_3><col_4><body>7</col_4><col_5><body></col_5></row_1>
 <row_2><col_0><body>8</col_0><col_1><body>9</col_1><col_2><body>10</col_2><col_3><body>11</col_3><col_4><body>12</col_4><col_5><body>2</col_5></row_2>
@ -55,7 +53,6 @@
 <paragraph><location><page_3><loc_8><loc_21><loc_47><loc_38></location>Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.</paragraph>
 <subtitle-level-1><location><page_3><loc_8><loc_18><loc_17><loc_20></location>3. Datasets</subtitle-level-1>
 <paragraph><location><page_3><loc_8><loc_10><loc_47><loc_17></location>We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-</paragraph>
-<caption><location><page_3><loc_50><loc_64><loc_89><loc_66></location>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
 <figure>
 <location><page_3><loc_51><loc_68><loc_90><loc_90></location>
 <caption>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption>
@ -68,7 +65,6 @@
 <paragraph><location><page_4><loc_8><loc_45><loc_47><loc_60></location>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</paragraph>
 <paragraph><location><page_4><loc_8><loc_21><loc_47><loc_45></location>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</paragraph>
 <paragraph><location><page_4><loc_8><loc_10><loc_47><loc_20></location>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</paragraph>
-<caption><location><page_4><loc_50><loc_72><loc_89><loc_79></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
 <table>
 <location><page_4><loc_51><loc_80><loc_89><loc_91></location>
 <caption>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
@ -80,6 +76,7 @@
 <row_5><col_0><row_header>Combined(**)</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>500k</col_3><col_4><body>PNG</col_4></row_5>
 <row_6><col_0><row_header>SynthTabNet</col_0><col_1><body>3</col_1><col_2><body>3</col_2><col_3><body>600k</col_3><col_4><body>PNG</col_4></row_6>
 </table>
+<caption><location><page_4><loc_50><loc_72><loc_89><loc_79></location>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption>
 <paragraph><location><page_4><loc_50><loc_63><loc_89><loc_68></location>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</paragraph>
 <paragraph><location><page_4><loc_52><loc_61><loc_89><loc_62></location>Tab. 1 summarizes the various attributes of the datasets.</paragraph>
 <subtitle-level-1><location><page_4><loc_50><loc_58><loc_73><loc_59></location>4. The TableFormer model</subtitle-level-1>
@ -87,12 +84,10 @@
 <subtitle-level-1><location><page_4><loc_50><loc_41><loc_69><loc_42></location>4.1. Model architecture.</subtitle-level-1>
 <paragraph><location><page_4><loc_50><loc_16><loc_89><loc_40></location>We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.</paragraph>
 <paragraph><location><page_4><loc_50><loc_10><loc_89><loc_16></location>CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-</paragraph>
-<caption><location><page_5><loc_8><loc_72><loc_89><loc_74></location>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
 <figure>
 <location><page_5><loc_12><loc_77><loc_85><loc_90></location>
 <caption>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption>
 </figure>
-<caption><location><page_5><loc_8><loc_14><loc_47><loc_33></location>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
 <figure>
 <location><page_5><loc_9><loc_36><loc_47><loc_67></location>
 <caption>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption>
@ -110,8 +105,7 @@
 <subtitle-level-1><location><page_6><loc_8><loc_28><loc_28><loc_30></location>5. Experimental Results</subtitle-level-1>
 <subtitle-level-1><location><page_6><loc_8><loc_26><loc_29><loc_27></location>5.1. Implementation Details</subtitle-level-1>
 <paragraph><location><page_6><loc_8><loc_19><loc_47><loc_25></location>TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:</paragraph>
-<paragraph><location><page_6><loc_8><loc_10><loc_47><loc_13></location>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved</paragraph>
-<paragraph><location><page_6><loc_50><loc_86><loc_89><loc_91></location>runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</paragraph>
+<paragraph><location><page_6><loc_8><loc_10><loc_47><loc_13></location><location><page_6><loc_8><loc_10><loc_47><loc_13></location>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</paragraph>
 <paragraph><location><page_6><loc_50><loc_59><loc_89><loc_85></location>The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.</paragraph>
 <paragraph><location><page_6><loc_50><loc_46><loc_89><loc_58></location>For training, TableFormer is trained with 3 Adam optimizers, each one for the CNN Backbone Network , Structure Decoder , and Cell BBox Decoder . Taking the PubTabNet as an example for our parameter set up, the initializing learning rate is 0.001 for 12 epochs with a batch size of 24, and λ set to 0.5. Afterwards, we reduce the learning rate to 0.0001, the batch size to 18 and train for 12 more epochs or convergence.</paragraph>
 <paragraph><location><page_6><loc_50><loc_30><loc_89><loc_45></location>TableFormer is implemented with PyTorch and Torchvision libraries [22]. To speed up the inference, the image undergoes a single forward pass through the CNN Backbone Network and transformer encoder. This eliminates the overhead of generating the same features for each decoding step. Similarly, we employ a 'caching' technique to preform faster autoregressive decoding. This is achieved by storing the features of decoded tokens so we can reuse them for each time step. Therefore, we only compute the attention for each new tag.</paragraph>
@ -123,10 +117,8 @@
 <paragraph><location><page_7><loc_8><loc_73><loc_47><loc_77></location>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</paragraph>
 <subtitle-level-1><location><page_7><loc_8><loc_70><loc_28><loc_72></location>5.4. Quantitative Analysis</subtitle-level-1>
 <paragraph><location><page_7><loc_8><loc_50><loc_47><loc_69></location>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</paragraph>
-<caption><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
 <table>
 <location><page_7><loc_9><loc_26><loc_46><loc_48></location>
-<caption>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption>
 <row_0><col_0><col_header>Model</col_0><col_1><col_header>Dataset</col_1><col_2><col_header>Simple</col_2><col_3><col_header>TEDS Complex</col_3><col_4><col_header>All</col_4></row_0>
 <row_1><col_0><row_header>EDD</col_0><col_1><body>PTN</col_1><col_2><body>91.1</col_2><col_3><body>88.7</col_3><col_4><body>89.9</col_4></row_1>
 <row_2><col_0><row_header>GTE</col_0><col_1><body>PTN</col_1><col_2><body>-</col_2><col_3><body>-</col_3><col_4><body>93.01</col_4></row_2>
@ -139,10 +131,9 @@
 <row_9><col_0><row_header>TableFormer</col_0><col_1><body>TB</col_1><col_2><body>89.6</col_2><col_3><body>-</col_3><col_4><body>89.6</col_4></row_9>
 <row_10><col_0><row_header>TableFormer</col_0><col_1><body>STN</col_1><col_2><body>96.9</col_2><col_3><body>95.7</col_3><col_4><body>96.7</col_4></row_10>
 </table>
+<paragraph><location><page_7><loc_8><loc_23><loc_47><loc_25></location>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</paragraph>
 <paragraph><location><page_7><loc_8><loc_21><loc_43><loc_22></location>FT: Model was trained on PubTabNet then finetuned.</paragraph>
-<paragraph><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</paragraph>
-<paragraph><location><page_7><loc_50><loc_71><loc_89><loc_91></location>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</paragraph>
-<caption><location><page_7><loc_50><loc_57><loc_89><loc_60></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
+<paragraph><location><page_7><loc_8><loc_10><loc_47><loc_19></location><location><page_7><loc_8><loc_10><loc_47><loc_19></location>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</paragraph>
 <table>
 <location><page_7><loc_50><loc_62><loc_87><loc_69></location>
 <caption>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
@ -151,8 +142,8 @@
 <row_2><col_0><body>TableFormer</col_0><col_1><body>PubTabNet</col_1><col_2><body>82.1</col_2><col_3><body>86.8</col_3></row_2>
 <row_3><col_0><body>TableFormer</col_0><col_1><body>SynthTabNet</col_1><col_2><body>87.7</col_2><col_3><body>-</col_3></row_3>
 </table>
+<caption><location><page_7><loc_50><loc_57><loc_89><loc_60></location>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption>
 <paragraph><location><page_7><loc_50><loc_34><loc_89><loc_54></location>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</paragraph>
-<caption><location><page_7><loc_50><loc_13><loc_89><loc_17></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
 <table>
 <location><page_7><loc_54><loc_19><loc_85><loc_32></location>
 <caption>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
@ -164,6 +155,7 @@
 <row_5><col_0><row_header>EDD</col_0><col_1><body>91.2</col_1><col_2><body>85.4</col_2><col_3><body>88.3</col_3></row_5>
 <row_6><col_0><row_header>TableFormer</col_0><col_1><body>95.4</col_1><col_2><body>90.1</col_2><col_3><body>93.6</col_3></row_6>
 </table>
+<caption><location><page_7><loc_50><loc_13><loc_89><loc_17></location>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption>
 <paragraph><location><page_8><loc_9><loc_89><loc_10><loc_90></location>- a.</paragraph>
 <paragraph><location><page_8><loc_11><loc_89><loc_82><loc_90></location>- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</paragraph>
 <subtitle-level-1><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</subtitle-level-1>
@ -171,13 +163,13 @@
 <figure>
 <location><page_8><loc_8><loc_76><loc_49><loc_87></location>
 </figure>
-<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 <figure>
 <location><page_8><loc_50><loc_77><loc_91><loc_88></location>
 <caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
 <table>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
+<caption>Text is aligned to match original for ease of viewing</caption>
 <row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
 <row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
 <row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
@ -192,7 +184,6 @@
 <caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
 <table>
 <location><page_8><loc_50><loc_64><loc_90><loc_72></location>
-<caption>Text is aligned to match original for ease of viewing</caption>
 <row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
 <row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
 <row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
@ -201,38 +192,36 @@
 <row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
 <row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
 </table>
-<caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 <figure>
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
+<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
+</figure>
+<figure>
+<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
 <caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 </figure>
 <figure>
 <location><page_8><loc_63><loc_44><loc_89><loc_52></location>
 </figure>
-<caption><location><page_8><loc_10><loc_41><loc_87><loc_42></location>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
-<figure>
-<location><page_8><loc_35><loc_44><loc_61><loc_52></location>
-<caption>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption>
-</figure>
 <subtitle-level-1><location><page_8><loc_8><loc_37><loc_27><loc_38></location>5.5. Qualitative Analysis</subtitle-level-1>
-<paragraph><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</paragraph>
 <subtitle-level-1><location><page_8><loc_50><loc_37><loc_75><loc_38></location>6. Future Work & Conclusion</subtitle-level-1>
+<paragraph><location><page_8><loc_8><loc_10><loc_47><loc_32></location>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</paragraph>
 <paragraph><location><page_8><loc_50><loc_18><loc_89><loc_35></location>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</paragraph>
 <subtitle-level-1><location><page_8><loc_50><loc_14><loc_60><loc_15></location>References</subtitle-level-1>
 <paragraph><location><page_8><loc_51><loc_10><loc_89><loc_12></location>- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</paragraph>
 <paragraph><location><page_9><loc_11><loc_85><loc_47><loc_90></location>- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</paragraph>
 <paragraph><location><page_9><loc_9><loc_81><loc_47><loc_85></location>- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</paragraph>
 <paragraph><location><page_9><loc_9><loc_77><loc_47><loc_81></location>- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</paragraph>
-<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_76></location>- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
+<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_76></location>- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
 <paragraph><location><page_9><loc_9><loc_66><loc_47><loc_71></location>- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</paragraph>
-<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
+<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
 <paragraph><location><page_9><loc_9><loc_56><loc_47><loc_60></location>- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</paragraph>
 <paragraph><location><page_9><loc_9><loc_49><loc_47><loc_56></location>- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</paragraph>
 <paragraph><location><page_9><loc_9><loc_45><loc_47><loc_49></location>- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</paragraph>
 <paragraph><location><page_9><loc_8><loc_39><loc_47><loc_44></location>- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</paragraph>
 <paragraph><location><page_9><loc_8><loc_32><loc_47><loc_39></location>- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</paragraph>
 <paragraph><location><page_9><loc_8><loc_25><loc_47><loc_32></location>- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</paragraph>
-<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
+<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
 <paragraph><location><page_9><loc_8><loc_14><loc_47><loc_18></location>- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</paragraph>
 <paragraph><location><page_9><loc_8><loc_10><loc_47><loc_14></location>- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</paragraph>
 <paragraph><location><page_9><loc_50><loc_82><loc_89><loc_90></location>- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</paragraph>
@ -241,7 +230,7 @@
 <paragraph><location><page_9><loc_50><loc_59><loc_89><loc_67></location>- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</paragraph>
 <paragraph><location><page_9><loc_50><loc_53><loc_89><loc_58></location>- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</paragraph>
 <paragraph><location><page_9><loc_50><loc_45><loc_89><loc_53></location>- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</paragraph>
-<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
+<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
 <paragraph><location><page_9><loc_50><loc_21><loc_89><loc_29></location>- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</paragraph>
 <paragraph><location><page_9><loc_50><loc_16><loc_89><loc_21></location>- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</paragraph>
 <paragraph><location><page_9><loc_50><loc_10><loc_89><loc_15></location>- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</paragraph>
@ -267,8 +256,7 @@
 <paragraph><location><page_11><loc_8><loc_21><loc_47><loc_51></location>We have developed a technique that tries to derive a missing bounding box out of its neighbors. As a first step, we use the annotation data to generate the most fine-grained grid that covers the table structure. In case of strict HTML tables, all grid squares are associated with some table cell and in the presence of table spans a cell extends across multiple grid squares. When enough bounding boxes are known for a rectangular table, it is possible to compute the geometrical border lines between the grid rows and columns. Eventually this information is used to generate the missing bounding boxes. Additionally, the existence of unused grid squares indicates that the table rows have unequal number of columns and the overall structure is non-strict. The generation of missing bounding boxes for non-strict HTML tables is ambiguous and therefore quite challenging. Thus, we have decided to simply discard those tables. In case of PubTabNet we have computed missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</paragraph>
 <paragraph><location><page_11><loc_8><loc_18><loc_47><loc_20></location>Figure 7 illustrates the distribution of the tables across different dimensions per dataset.</paragraph>
 <subtitle-level-1><location><page_11><loc_8><loc_15><loc_25><loc_16></location>1.2. Synthetic datasets</subtitle-level-1>
-<paragraph><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</paragraph>
-<paragraph><location><page_11><loc_50><loc_74><loc_89><loc_79></location>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</paragraph>
+<paragraph><location><page_11><loc_8><loc_10><loc_47><loc_14></location><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</paragraph>
 <paragraph><location><page_11><loc_50><loc_71><loc_89><loc_73></location>The process of generating a synthetic dataset can be decomposed into the following steps:</paragraph>
 <paragraph><location><page_11><loc_50><loc_60><loc_89><loc_70></location>- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</paragraph>
 <paragraph><location><page_11><loc_50><loc_43><loc_89><loc_60></location>- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</paragraph>
@ -277,13 +265,13 @@
 <paragraph><location><page_11><loc_50><loc_23><loc_89><loc_31></location>- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</paragraph>
 <subtitle-level-1><location><page_11><loc_50><loc_18><loc_89><loc_21></location>2. Prediction post-processing for PDF documents</subtitle-level-1>
 <paragraph><location><page_11><loc_50><loc_10><loc_89><loc_17></location>Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons:</paragraph>
-<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
 <figure>
 <location><page_12><loc_9><loc_81><loc_89><loc_91></location>
 <caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
 </figure>
 <paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
 <paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
+<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
 <paragraph><location><page_12><loc_8><loc_50><loc_47><loc_65></location>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</paragraph>
 <paragraph><location><page_12><loc_8><loc_47><loc_47><loc_50></location>Here is a step-by-step description of the prediction postprocessing:</paragraph>
 <paragraph><location><page_12><loc_8><loc_42><loc_47><loc_47></location>- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</paragraph>
@ -293,7 +281,6 @@
 <paragraph><location><page_12><loc_8><loc_24><loc_47><loc_28></location>- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</paragraph>
 <paragraph><location><page_12><loc_8><loc_13><loc_47><loc_16></location>where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.</paragraph>
 <paragraph><location><page_12><loc_8><loc_10><loc_47><loc_13></location>- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</paragraph>
-<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</paragraph>
 <paragraph><location><page_12><loc_50><loc_65><loc_89><loc_67></location>- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</paragraph>
 <paragraph><location><page_12><loc_50><loc_51><loc_89><loc_64></location>- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</paragraph>
 <paragraph><location><page_12><loc_50><loc_42><loc_89><loc_51></location>- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.</paragraph>
@ -315,14 +302,15 @@
 <table>
 <location><page_13><loc_14><loc_54><loc_39><loc_61></location>
 </table>
-<caption><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</caption>
 <table>
 <location><page_13><loc_14><loc_38><loc_41><loc_50></location>
-<caption>Figure 8: Example of a table with multi-line header.</caption>
 </table>
+<caption><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</caption>
 <table>
 <location><page_13><loc_51><loc_83><loc_91><loc_87></location>
+<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </table>
+<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
 <table>
 <location><page_13><loc_51><loc_77><loc_91><loc_80></location>
 </table>
@ -332,14 +320,14 @@
 <figure>
 <location><page_13><loc_51><loc_63><loc_70><loc_68></location>
 </figure>
-<caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
 <table>
 <location><page_13><loc_51><loc_63><loc_70><loc_68></location>
-<caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </table>
 <table>
 <location><page_13><loc_55><loc_45><loc_80><loc_51></location>
+<caption>Figure 10: Example of a complex table with empty cells.</caption>
 </table>
+<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
 <table>
 <location><page_13><loc_55><loc_37><loc_80><loc_43></location>
 </table>
@ -349,19 +337,16 @@
 <figure>
 <location><page_13><loc_55><loc_16><loc_85><loc_25></location>
 </figure>
-<caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
 <table>
 <location><page_13><loc_55><loc_16><loc_85><loc_25></location>
-<caption>Figure 10: Example of a complex table with empty cells.</caption>
 </table>
 <table>
 <location><page_14><loc_8><loc_57><loc_46><loc_65></location>
 </table>
-<caption><location><page_14><loc_8><loc_52><loc_47><loc_55></location>Figure 11: Simple table with different style and empty cells.</caption>
 <figure>
 <location><page_14><loc_8><loc_56><loc_46><loc_87></location>
-<caption>Figure 11: Simple table with different style and empty cells.</caption>
 </figure>
+<caption><location><page_14><loc_8><loc_52><loc_47><loc_55></location>Figure 11: Simple table with different style and empty cells.</caption>
 <table>
 <location><page_14><loc_8><loc_38><loc_51><loc_43></location>
 </table>
@ -371,11 +356,10 @@
 <table>
 <location><page_14><loc_8><loc_25><loc_51><loc_30></location>
 </table>
-<caption><location><page_14><loc_9><loc_14><loc_46><loc_15></location>Figure 12: Simple table predictions and post processing.</caption>
 <figure>
 <location><page_14><loc_8><loc_17><loc_29><loc_23></location>
-<caption>Figure 12: Simple table predictions and post processing.</caption>
 </figure>
+<caption><location><page_14><loc_9><loc_14><loc_46><loc_15></location>Figure 12: Simple table predictions and post processing.</caption>
 <table>
 <location><page_14><loc_52><loc_73><loc_87><loc_80></location>
 </table>
@ -385,24 +369,23 @@
 <table>
 <location><page_14><loc_54><loc_55><loc_86><loc_64></location>
 </table>
-<caption><location><page_14><loc_52><loc_52><loc_88><loc_53></location>Figure 13: Table predictions example on colorful table.</caption>
 <figure>
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
 <table>
 <location><page_14><loc_52><loc_40><loc_85><loc_46></location>
+<caption>Figure 14: Example with multi-line text.</caption>
 </table>
+<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
 <table>
 <location><page_14><loc_52><loc_32><loc_85><loc_38></location>
 </table>
 <table>
 <location><page_14><loc_52><loc_25><loc_85><loc_31></location>
 </table>
-<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
 <table>
 <location><page_14><loc_52><loc_16><loc_87><loc_23></location>
-<caption>Figure 14: Example with multi-line text.</caption>
 </table>
 <figure>
 <location><page_15><loc_9><loc_69><loc_46><loc_83></location>
@ -422,14 +405,11 @@
 <figure>
 <location><page_15><loc_8><loc_20><loc_52><loc_36></location>
 </figure>
-<caption><location><page_15><loc_14><loc_18><loc_41><loc_19></location>Figure 15: Example with triangular table.</caption>
-<table>
-<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
-<caption>Figure 15: Example with triangular table.</caption>
-</table>
 <table>
 <location><page_15><loc_53><loc_72><loc_86><loc_85></location>
+<caption>Figure 15: Example with triangular table.</caption>
 </table>
+<caption><location><page_15><loc_14><loc_18><loc_41><loc_19></location>Figure 15: Example with triangular table.</caption>
 <table>
 <location><page_15><loc_53><loc_57><loc_86><loc_69></location>
 </table>
@ -442,12 +422,13 @@
 <figure>
 <location><page_15><loc_58><loc_20><loc_81><loc_38></location>
 </figure>
-<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
 <table>
 <location><page_15><loc_58><loc_20><loc_81><loc_38></location>
-<caption>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
 </table>
-<caption><location><page_16><loc_8><loc_33><loc_89><loc_36></location>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
+<table>
+<location><page_15><loc_8><loc_20><loc_52><loc_36></location>
+</table>
+<caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
 <figure>
 <location><page_16><loc_11><loc_37><loc_86><loc_68></location>
 <caption>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption>
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.md
@ -8,25 +8,22 @@

 ## a. Picture of a table:

+Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
+
 ## 1. Introduction

 The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.

 <!-- image -->

-Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
-
-
-
 - b. Red-annotation of bounding boxes, Blue-predictions by TableFormer

-<!-- image -->
-
 - c. Structure predicted by TableFormer:

 <!-- image -->

 Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.
+<!-- image -->

 | 0   |   1 | 1   |   2 1 |   2 1 |    |
 |-----|-----|-----|-------|-------|----|
@ -155,9 +152,7 @@ where λ ∈ [0, 1], and λ$_{iou}$, λ$_{l}$$_{1}$ ∈$_{R}$ are hyper-paramete

 TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:

-Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved
-
-runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.
+Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.

 The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.

@ -181,8 +176,6 @@ where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDi

 Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.

-Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
-
 | Model       | Dataset   | Simple   | TEDS Complex   |   All |
 |-------------|-----------|----------|----------------|-------|
 | EDD         | PTN       | 91.1     | 88.7           | 89.9  |
@ -196,11 +189,11 @@ Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) a
 | TableFormer | TB        | 89.6     | -              | 89.6  |
 | TableFormer | STN       | 96.9     | 95.7           | 96.7  |

+Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
+
 FT: Model was trained on PubTabNet then finetuned.

-Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate
-
-our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
+Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.

 Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.

@ -236,6 +229,8 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
 b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
 <!-- image -->

+Text is aligned to match original for ease of viewing
+
 |                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
 |----------------------------------------------------|-------------|----------------|----------------|------------|------------|
 | 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
@ -248,8 +243,6 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
 | WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
 |                                                    | 945         | 294            | 651            | 1122       | 955        |

-Text is aligned to match original for ease of viewing
-
 |                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
 |--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
 |                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
@ -259,20 +252,20 @@ Text is aligned to match original for ease of viewing
 | Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
 | Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |

+Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
+<!-- image -->
+
 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
 <!-- image -->

 <!-- image -->

-Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
-<!-- image -->
-
 ## 5.5. Qualitative Analysis

-We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
-
 ## 6. Future Work & Conclusion

+We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
+
 In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.

 ## References
@ -285,11 +278,11 @@ In this paper, we presented TableFormer an end-to-end transformer based approach

 - [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2

- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
+- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2

 - [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2

- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
+- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2

 - [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2

@ -303,7 +296,7 @@ In this paper, we presented TableFormer an end-to-end transformer based approach

 - [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2

- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
+- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2

 - [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2

@ -321,7 +314,7 @@ In this paper, we presented TableFormer an end-to-end transformer based approach

 - [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1

- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
+- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6

 - [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1

@ -373,9 +366,7 @@ Figure 7 illustrates the distribution of the tables across different dimensions

 ## 1.2. Synthetic datasets

-Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-
-
-ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).
+Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).

 The process of generating a synthetic dataset can be decomposed into the following steps:

@ -400,6 +391,8 @@ Figure 7: Distribution of the tables across different dimensions per dataset. Si

 - · There are occasional inaccuracies in the predictions of the bounding boxes.

+dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
+
 However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.

 Here is a step-by-step description of the prediction postprocessing:
@ -418,8 +411,6 @@ where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for t

 - 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-

-dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
-
 - 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.

 - 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
@ -446,10 +437,6 @@ Aditional images with examples of TableFormer predictions and post-processing ca

 Figure 8: Example of a table with multi-line header.

-
-
-<!-- image -->
-
 Figure 9: Example of a table with big empty distance between cells.


@ -460,11 +447,15 @@ Figure 10: Example of a complex table with empty cells.



+<!-- image -->
+
+<!-- image -->
+
 Figure 11: Simple table with different style and empty cells.
+
 <!-- image -->

 Figure 12: Simple table predictions and post processing.
-<!-- image -->

 Figure 13: Table predictions example on colorful table.
 <!-- image -->
@ -491,7 +482,5 @@ Figure 15: Example with triangular table.

 Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.

-
-
 Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.
 <!-- image -->
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
@ -10,11 +10,10 @@
 <subtitle-level-1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</subtitle-level-1>
 <paragraph><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</paragraph>
 <paragraph><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</paragraph>
-<paragraph><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</paragraph>
+<paragraph><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD ’22, August 14-18, 2022, Washington, DC, USA</paragraph>
 <paragraph><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</paragraph>
 <paragraph><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
 <paragraph><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</paragraph>
-<caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
 <figure>
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
 <caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
@ -41,7 +40,6 @@
 <subtitle-level-1><location><page_2><loc_52><loc_27><loc_78><loc_29></location>3 THE DOCLAYNET DATASET</subtitle-level-1>
 <paragraph><location><page_2><loc_52><loc_15><loc_91><loc_25></location>DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula , List-item , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4.</paragraph>
 <paragraph><location><page_2><loc_52><loc_11><loc_91><loc_14></location>In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents</paragraph>
-<caption><location><page_3><loc_9><loc_68><loc_48><loc_70></location>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
 <figure>
 <location><page_3><loc_14><loc_72><loc_43><loc_88></location>
 <caption>Figure 2: Distribution of DocLayNet pages across document categories.</caption>
@ -55,7 +53,6 @@
 <paragraph><location><page_3><loc_52><loc_26><loc_91><loc_65></location>Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, "invisible" tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as "invisible" list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a "natural" upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4.</paragraph>
 <subtitle-level-1><location><page_3><loc_52><loc_22><loc_77><loc_23></location>4 ANNOTATION CAMPAIGN</subtitle-level-1>
 <paragraph><location><page_3><loc_52><loc_11><loc_91><loc_20></location>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</paragraph>
-<caption><location><page_4><loc_9><loc_85><loc_91><loc_89></location>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
 <table>
 <location><page_4><loc_16><loc_63><loc_84><loc_83></location>
 <caption>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
@ -74,14 +71,13 @@
 <row_12><col_0><row_header>Title</col_0><col_1><body>5071</col_1><col_2><body>0.47</col_2><col_3><body>0.30</col_3><col_4><body>0.50</col_4><col_5><body>60-72</col_5><col_6><body>24-63</col_6><col_7><body>50-63</col_7><col_8><body>94-100</col_8><col_9><body>82-96</col_9><col_10><body>68-79</col_10><col_11><body>24-56</col_11></row_12>
 <row_13><col_0><row_header>Total</col_0><col_1><body>1107470</col_1><col_2><body>941123</col_2><col_3><body>99816</col_3><col_4><body>66531</col_4><col_5><body>82-83</col_5><col_6><body>71-74</col_6><col_7><body>79-81</col_7><col_8><body>89-94</col_8><col_9><body>86-91</col_9><col_10><body>71-76</col_10><col_11><body>68-85</col_11></row_13>
 </table>
-<caption><location><page_4><loc_9><loc_23><loc_48><loc_30></location>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
+<caption><location><page_4><loc_9><loc_85><loc_91><loc_89></location>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption>
 <figure>
 <location><page_4><loc_9><loc_32><loc_48><loc_61></location>
 <caption>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption>
 </figure>
 <paragraph><location><page_4><loc_9><loc_15><loc_48><loc_20></location>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</paragraph>
-<paragraph><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources</paragraph>
-<paragraph><location><page_4><loc_52><loc_53><loc_91><loc_61></location>include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</paragraph>
+<paragraph><location><page_4><loc_9><loc_11><loc_48><loc_14></location><location><page_4><loc_9><loc_11><loc_48><loc_14></location>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</paragraph>
 <paragraph><location><page_4><loc_52><loc_36><loc_91><loc_52></location>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</paragraph>
 <paragraph><location><page_4><loc_52><loc_12><loc_91><loc_36></location>Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on</paragraph>
 <paragraph><location><page_5><loc_9><loc_87><loc_48><loc_89></location>the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.</paragraph>
@ -95,18 +91,16 @@
 <paragraph><location><page_5><loc_11><loc_34><loc_48><loc_38></location>- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.</paragraph>
 <paragraph><location><page_5><loc_9><loc_27><loc_48><loc_33></location>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</paragraph>
 <paragraph><location><page_5><loc_9><loc_11><loc_48><loc_27></location>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</paragraph>
-<caption><location><page_5><loc_52><loc_36><loc_91><loc_40></location>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
 <figure>
 <location><page_5><loc_52><loc_42><loc_91><loc_89></location>
-<caption>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
 </figure>
 <paragraph><location><page_5><loc_65><loc_42><loc_78><loc_42></location>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</paragraph>
+<caption><location><page_5><loc_52><loc_36><loc_91><loc_40></location>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
 <paragraph><location><page_5><loc_52><loc_31><loc_91><loc_34></location>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</paragraph>
 <paragraph><location><page_5><loc_52><loc_10><loc_91><loc_31></location>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</paragraph>
-<caption><location><page_6><loc_9><loc_77><loc_48><loc_89></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
+<paragraph><location><page_6><loc_9><loc_77><loc_48><loc_89></location>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</paragraph>
 <table>
 <location><page_6><loc_10><loc_56><loc_47><loc_75></location>
-<caption>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption>
 <row_0><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>MRCNN</col_2><col_3><col_header>MRCNN</col_3><col_4><col_header>FRCNN</col_4><col_5><col_header>YOLO</col_5></row_0>
 <row_1><col_0><body></col_0><col_1><col_header>human</col_1><col_2><col_header>R50</col_2><col_3><col_header>R101</col_3><col_4><col_header>R101</col_4><col_5><col_header>v5x6</col_5></row_1>
 <row_2><col_0><row_header>Caption</col_0><col_1><body>84-89</col_1><col_2><body>68.4</col_2><col_3><body>71.5</col_3><col_4><body>70.1</col_4><col_5><body>77.7</col_5></row_2>
@ -125,7 +119,6 @@
 <paragraph><location><page_6><loc_9><loc_27><loc_48><loc_53></location>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</paragraph>
 <subtitle-level-1><location><page_6><loc_9><loc_24><loc_24><loc_26></location>5 EXPERIMENTS</subtitle-level-1>
 <paragraph><location><page_6><loc_9><loc_10><loc_48><loc_23></location>The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this</paragraph>
-<caption><location><page_6><loc_52><loc_57><loc_91><loc_65></location>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
 <figure>
 <location><page_6><loc_53><loc_67><loc_90><loc_89></location>
 <caption>Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.</caption>
@ -135,10 +128,8 @@
 <subtitle-level-1><location><page_6><loc_52><loc_36><loc_76><loc_37></location>Baselines for Object Detection</subtitle-level-1>
 <paragraph><location><page_6><loc_52><loc_11><loc_91><loc_35></location>In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.</paragraph>
 <paragraph><location><page_7><loc_9><loc_84><loc_48><loc_89></location>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</paragraph>
-<caption><location><page_7><loc_52><loc_84><loc_91><loc_89></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption>
 <table>
 <location><page_7><loc_13><loc_63><loc_44><loc_81></location>
-<caption>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption>
 <row_0><col_0><col_header>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>6</col_2><col_3><col_header>5</col_3><col_4><col_header>4</col_4></row_0>
 <row_1><col_0><row_header>Caption</col_0><col_1><body>68</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_1>
 <row_2><col_0><row_header>Footnote</col_0><col_1><body>71</col_1><col_2><body>Text</col_2><col_3><body>Text</col_3><col_4><body>Text</col_4></row_2>
@ -157,6 +148,7 @@
 <paragraph><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</paragraph>
 <subtitle-level-1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</subtitle-level-1>
 <paragraph><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</paragraph>
+<paragraph><location><page_7><loc_52><loc_84><loc_91><loc_89></location>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</paragraph>
 <table>
 <location><page_7><loc_58><loc_61><loc_85><loc_81></location>
 <row_0><col_0><body>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>11</col_2><col_3><col_header>5</col_3><col_4><col_header>5</col_4></row_0>
@ -179,10 +171,9 @@
 <paragraph><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</paragraph>
 <subtitle-level-1><location><page_7><loc_52><loc_22><loc_68><loc_23></location>Dataset Comparison</subtitle-level-1>
 <paragraph><location><page_7><loc_52><loc_11><loc_91><loc_21></location>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</paragraph>
-<caption><location><page_8><loc_9><loc_81><loc_48><loc_89></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
+<paragraph><location><page_8><loc_9><loc_81><loc_48><loc_89></location>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</paragraph>
 <table>
 <location><page_8><loc_12><loc_57><loc_45><loc_78></location>
-<caption>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption>
 <row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>Testing on</col_2><col_3><col_header>Testing on</col_3><col_4><col_header>Testing on</col_4></row_0>
 <row_1><col_0><col_header>Training on</col_0><col_1><col_header>labels</col_1><col_2><col_header>PLN</col_2><col_3><col_header>DB</col_3><col_4><col_header>DLN</col_4></row_1>
 <row_2><col_0><row_header>PubLayNet (PLN)</col_0><col_1><row_header>Figure</col_1><col_2><body>96</col_2><col_3><body>43</col_3><col_4><body>23</col_4></row_2>
@ -221,20 +212,19 @@
 <paragraph><location><page_8><loc_52><loc_18><loc_91><loc_21></location>- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.</paragraph>
 <paragraph><location><page_8><loc_52><loc_15><loc_91><loc_18></location>- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</paragraph>
 <paragraph><location><page_8><loc_52><loc_11><loc_91><loc_15></location>- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</paragraph>
-<caption><location><page_9><loc_10><loc_43><loc_52><loc_44></location>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
 <figure>
 <location><page_9><loc_9><loc_44><loc_91><loc_89></location>
 <caption>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
 </figure>
 <paragraph><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</paragraph>
 <paragraph><location><page_9><loc_11><loc_31><loc_48><loc_33></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</paragraph>
+<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_33></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
 <paragraph><location><page_9><loc_9><loc_28><loc_48><loc_30></location>- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</paragraph>
 <paragraph><location><page_9><loc_9><loc_26><loc_48><loc_27></location>- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</paragraph>
 <paragraph><location><page_9><loc_9><loc_23><loc_48><loc_25></location>- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</paragraph>
 <paragraph><location><page_9><loc_9><loc_21><loc_48><loc_22></location>- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</paragraph>
 <paragraph><location><page_9><loc_9><loc_16><loc_48><loc_20></location>- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</paragraph>
 <paragraph><location><page_9><loc_9><loc_10><loc_48><loc_15></location>- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</paragraph>
-<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_33></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
 <paragraph><location><page_9><loc_52><loc_29><loc_91><loc_31></location>- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</paragraph>
 <paragraph><location><page_9><loc_52><loc_25><loc_91><loc_28></location>- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</paragraph>
 <paragraph><location><page_9><loc_52><loc_23><loc_91><loc_24></location>- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</paragraph>
--- a/tests/data/groundtruth/docling_v1/2206.01062.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.md
+++ b/tests/data/groundtruth/docling_v1/2206.01062.md
@ -20,7 +20,7 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA
+KDD ’22, August 14-18, 2022, Washington, DC, USA

 © 2022 Copyright held by the owner/author(s).

@ -119,9 +119,7 @@ Figure 3: Corpus Conversion Service annotation user interface. The PDF page is s

 we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.

-Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources
-
-include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.
+Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.

 Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.

@ -149,11 +147,12 @@ The complete annotation guideline is over 100 pages long and a detailed descript

 Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations

-Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
 <!-- image -->

 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0

+Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
+
 were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.

 Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted
@ -195,8 +194,6 @@ In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], F

 Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.

-Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.
-
 | Class-count    |   11 | 6       | 5       | 4       |
 |----------------|------|---------|---------|---------|
 | Caption        |   68 | Text    | Text    | Text    |
@ -220,6 +217,8 @@ One of the fundamental questions related to any dataset is if it is "large enoug

 The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of

+Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.
+
 | Class-count    | 11   | 11   | 5   | 5    |
 |----------------|------|------|-----|------|
 | Split          | Doc  | Page | Doc | Page |
@ -316,6 +315,8 @@ Figure 6: Example layout predictions on selected pages from the DocLayNet test-s

 Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.

+- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
+
 - [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.

 - [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
@ -328,8 +329,6 @@ Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultraly

 - [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.

- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
-
 - [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.

 - [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.doctags.txt
@ -2,7 +2,6 @@
 <paragraph><location><page_1><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</paragraph>
 <subtitle-level-1><location><page_1><loc_22><loc_77><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</subtitle-level-1>
 <paragraph><location><page_1><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</paragraph>
-<caption><location><page_1><loc_22><loc_59><loc_79><loc_66></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
 <table>
 <location><page_1><loc_23><loc_41><loc_78><loc_57></location>
 <caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
@ -14,6 +13,7 @@
 <row_5><col_0><body>2</col_0><col_1><body>4</col_1><col_2><body>HTML</col_2><col_3><body>0.945</col_3><col_4><body>0.897 0.901</col_4><col_5><body>0.915 0.931</col_5><col_6><body>0.859 0.834</col_6><col_7><body>1.91 3.81</col_7></row_5>
 <row_6><col_0><body>4</col_0><col_1><body>2</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.952 0.944</col_3><col_4><body>0.92 0.903</col_4><col_5><body>0.942 0.931</col_5><col_6><body>0.857 0.824</col_6><col_7><body>1.22 2</col_7></row_6>
 </table>
+<caption><location><page_1><loc_22><loc_59><loc_79><loc_66></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
 <subtitle-level-1><location><page_1><loc_22><loc_35><loc_43><loc_36></location>5.2 Quantitative Results</subtitle-level-1>
 <paragraph><location><page_1><loc_22><loc_22><loc_79><loc_34></location>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</paragraph>
 <paragraph><location><page_1><loc_22><loc_16><loc_79><loc_22></location>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</paragraph>
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
@ -9,7 +9,6 @@
 <subtitle-level-1><location><page_1><loc_22><loc_33><loc_37><loc_34></location>1 Introduction</subtitle-level-1>
 <paragraph><location><page_1><loc_22><loc_21><loc_79><loc_31></location>Tables are ubiquitous in documents such as scientific papers, patents, reports, manuals, specification sheets or marketing material. They often encode highly valuable information and therefore need to be extracted with high accuracy. Unfortunately, tables appear in documents in various sizes, styling and structure, making it difficult to recover their correct structure with simple analytical methods. Therefore, accurate table extraction is achieved these days with machine-learning based methods.</paragraph>
 <paragraph><location><page_1><loc_22><loc_16><loc_79><loc_20></location>In modern document understanding systems [1,15], table extraction is typically a two-step process. Firstly, every table on a page is located with a bounding box, and secondly, their logical row and column structure is recognized. As of</paragraph>
-<caption><location><page_2><loc_22><loc_75><loc_79><loc_84></location>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
 <figure>
 <location><page_2><loc_24><loc_46><loc_76><loc_74></location>
 <caption>Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key-features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and a enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).</caption>
@ -27,7 +26,6 @@
 <subtitle-level-1><location><page_4><loc_22><loc_22><loc_44><loc_24></location>3 Problem Statement</subtitle-level-1>
 <paragraph><location><page_4><loc_22><loc_16><loc_79><loc_20></location>All known Im2Seq based models for TSR fundamentally work in similar ways. Given an image of a table, the Im2Seq model predicts the structure of the table by generating a sequence of tokens. These tokens originate from a finite vocab-</paragraph>
 <paragraph><location><page_5><loc_22><loc_76><loc_79><loc_85></location>ulary and can be interpreted as a table structure. For example, with the HTML tokens <table> , </table> , <tr> , </tr> , <td> and </td> , one can construct simple table structures without any spanning cells. In reality though, one needs at least 28 HTML tokens to describe the most common complex tables observed in real-world documents [21,22], due to a variety of spanning cells definitions in the HTML token vocabulary.</paragraph>
-<caption><location><page_5><loc_24><loc_71><loc_77><loc_72></location>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
 <figure>
 <location><page_5><loc_22><loc_57><loc_78><loc_71></location>
 <caption>Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.</caption>
@ -48,7 +46,6 @@
 <paragraph><location><page_6><loc_23><loc_22><loc_74><loc_23></location>- -"X" cell cross cell , to merge with both left and upper neighbor cells</paragraph>
 <paragraph><location><page_6><loc_23><loc_20><loc_54><loc_21></location>- -"NL" new-line , switch to the next row.</paragraph>
 <paragraph><location><page_6><loc_22><loc_16><loc_79><loc_19></location>A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.</paragraph>
-<caption><location><page_7><loc_22><loc_80><loc_79><loc_84></location>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
 <figure>
 <location><page_7><loc_27><loc_65><loc_73><loc_79></location>
 <caption>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
@ -69,7 +66,6 @@
 <paragraph><location><page_8><loc_22><loc_62><loc_79><loc_77></location>The design of OTSL allows to validate a table structure easily on an unfinished sequence. The detection of an invalid sequence token is a clear indication of a prediction mistake, however a valid sequence by itself does not guarantee prediction correctness. Different heuristics can be used to correct token errors in an invalid sequence and thus increase the chances for accurate predictions. Such heuristics can be applied either after the prediction of each token, or at the end on the entire predicted sequence. For example a simple heuristic which can correct the predicted OTSL sequence on-the-fly is to verify if the token with the highest prediction confidence invalidates the predicted sequence, and replace it by the token with the next highest confidence until OTSL rules are satisfied.</paragraph>
 <subtitle-level-1><location><page_8><loc_22><loc_58><loc_37><loc_59></location>5 Experiments</subtitle-level-1>
 <paragraph><location><page_8><loc_22><loc_43><loc_79><loc_56></location>To evaluate the impact of OTSL on prediction accuracy and inference times, we conducted a series of experiments based on the TableFormer model (Figure 4) with two objectives: Firstly we evaluate the prediction quality and performance of OTSL vs. HTML after performing Hyper Parameter Optimization (HPO) on the canonical PubTabNet data set. Secondly we pick the best hyper-parameters found in the first step and evaluate how OTSL impacts the performance of TableFormer after training on other publicly available data sets (FinTabNet, PubTables-1M [14]). The ground truth (GT) from all data sets has been converted into OTSL format for this purpose, and will be made publicly available.</paragraph>
-<caption><location><page_8><loc_22><loc_36><loc_79><loc_39></location>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
 <figure>
 <location><page_8><loc_23><loc_25><loc_77><loc_36></location>
 <caption>Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.</caption>
@ -78,7 +74,6 @@
 <paragraph><location><page_9><loc_22><loc_81><loc_79><loc_85></location>order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.</paragraph>
 <subtitle-level-1><location><page_9><loc_22><loc_78><loc_52><loc_79></location>5.1 Hyper Parameter Optimization</subtitle-level-1>
 <paragraph><location><page_9><loc_22><loc_68><loc_79><loc_77></location>We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table. 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However OTSL yields a 2x speed up in the inference runtime over HTML.</paragraph>
-<caption><location><page_9><loc_22><loc_59><loc_79><loc_65></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
 <table>
 <location><page_9><loc_23><loc_41><loc_78><loc_57></location>
 <caption>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
@ -90,10 +85,10 @@
 <row_5><col_0><body></col_0><col_1><body></col_1><col_2><body>HTML</col_2><col_3><body></col_3><col_4><body>0.901</col_4><col_5><body>0.915 0.931</col_5><col_6><body>0.859 0.834</col_6><col_7><body>1.91 3.81</col_7></row_5>
 <row_6><col_0><body>4</col_0><col_1><body>2</col_1><col_2><body>OTSL HTML</col_2><col_3><body>0.952 0.944</col_3><col_4><body>0.92 0.903</col_4><col_5><body>0.942 0.931</col_5><col_6><body>0.857 0.824</col_6><col_7><body>1.22 2</col_7></row_6>
 </table>
+<caption><location><page_9><loc_22><loc_59><loc_79><loc_65></location>Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.</caption>
 <subtitle-level-1><location><page_9><loc_22><loc_35><loc_43><loc_36></location>5.2 Quantitative Results</subtitle-level-1>
 <paragraph><location><page_9><loc_22><loc_22><loc_79><loc_34></location>We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table. 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.</paragraph>
 <paragraph><location><page_9><loc_22><loc_16><loc_79><loc_22></location>Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.</paragraph>
-<caption><location><page_10><loc_22><loc_82><loc_79><loc_85></location>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
 <table>
 <location><page_10><loc_23><loc_67><loc_77><loc_80></location>
 <caption>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
@ -106,16 +101,15 @@
 <row_6><col_0><row_header>PubTables-1M</col_0><col_1><row_header>OTSL</col_1><col_2><body>0.987</col_2><col_3><body>0.964</col_3><col_4><body>0.977</col_4><col_5><body>0.896</col_5><col_6><body>1.79</col_6></row_6>
 <row_7><col_0><row_header>PubTables-1M</col_0><col_1><row_header>HTML</col_1><col_2><body>0.983</col_2><col_3><body>0.944</col_3><col_4><body>0.966</col_4><col_5><body>0.889</col_5><col_6><body>3.26</col_6></row_7>
 </table>
+<caption><location><page_10><loc_22><loc_82><loc_79><loc_85></location>Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).</caption>
 <subtitle-level-1><location><page_10><loc_22><loc_62><loc_42><loc_64></location>5.3 Qualitative Results</subtitle-level-1>
 <paragraph><location><page_10><loc_22><loc_54><loc_79><loc_61></location>To illustrate the qualitative differences between OTSL and HTML, Figure 5 demonstrates less overlap and more accurate bounding boxes with OTSL. In Figure 6, OTSL proves to be more effective in handling tables with longer token sequences, resulting in even more precise structure prediction and bounding boxes.</paragraph>
-<caption><location><page_10><loc_22><loc_44><loc_79><loc_50></location>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
 <figure>
 <location><page_10><loc_27><loc_16><loc_74><loc_44></location>
 <caption>Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B),(C). "PMC2807444_006_00.png" PubTabNet. μ</caption>
 </figure>
 <paragraph><location><page_10><loc_37><loc_15><loc_38><loc_16></location>μ</paragraph>
 <paragraph><location><page_10><loc_49><loc_12><loc_49><loc_14></location>≥</paragraph>
-<caption><location><page_11><loc_22><loc_78><loc_79><loc_84></location>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
 <figure>
 <location><page_11><loc_28><loc_20><loc_73><loc_77></location>
 <caption>Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also didn't complete the HTML sequence correctly and displayed a lot more of drift and overlap of bounding boxes. "PMC5406406_003_01.png" PubTabNet.</caption>
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.doctags.txt
@ -6,7 +6,6 @@
 <paragraph><location><page_1><loc_12><loc_52><loc_53><loc_62></location>The spring keeps the locking and load-carrying sections such a distance apart that the two sets of threads are out of phase or spaced so that a bolt, which has been screwed through the load-carrying section, must push the locking section outward against the force of the spring to engage the threads of the locking section properly.</paragraph>
 <paragraph><location><page_1><loc_12><loc_38><loc_54><loc_50></location>The spring, through the medium of the locking section, exerts a constant locking force on the bolt in the same direction as a force that would tighten the nut. In this nut, the load-carrying section has the thread strength of a standard nut of comparable size, while the locking section presses against the threads of the bolt and locks the nut firmly in position. Only a wrench applied to the nut loosens it. The nut can be removed and reused without impairing its efficiency.</paragraph>
 <paragraph><location><page_1><loc_12><loc_33><loc_53><loc_36></location>Boots self-locking nuts are made with three different spring styles and in various shapes and sizes. The wing type that is</paragraph>
-<caption><location><page_1><loc_12><loc_8><loc_31><loc_9></location>Figure 7-26. Self-locking nuts.</caption>
 <figure>
 <location><page_1><loc_12><loc_10><loc_52><loc_31></location>
 <caption>Figure 7-26. Self-locking nuts.</caption>
@ -17,7 +16,6 @@
 <paragraph><location><page_1><loc_54><loc_54><loc_96><loc_81></location>The stainless steel self-locking nut may be spun on and off by hand as its locking action takes places only when the nut is seated against a solid surface and tightened. The nut consists of two parts: a case with a beveled locking shoulder and key and a thread insert with a locking shoulder and slotted keyway. Until the nut is tightened, it spins on the bolt easily, because the threaded insert is the proper size for the bolt. However, when the nut is seated against a solid surface and tightened, the locking shoulder of the insert is pulled downward and wedged against the locking shoulder of the case. This action compresses the threaded insert and causes it to clench the bolt tightly. The cross-sectional view in Figure 7-27 shows how the key of the case fits into the slotted keyway of the insert so that when the case is turned, the threaded insert is turned with it. Note that the slot is wider than the key. This permits the slot to be narrowed and the insert to be compressed when the nut is tightened.</paragraph>
 <subtitle-level-1><location><page_1><loc_54><loc_51><loc_65><loc_52></location>Elastic Stop Nut</subtitle-level-1>
 <paragraph><location><page_1><loc_54><loc_47><loc_93><loc_50></location>The elastic stop nut is a standard nut with the height increased to accommodate a fiber locking collar. This</paragraph>
-<caption><location><page_1><loc_54><loc_8><loc_81><loc_10></location>Figure 7-27. Stainless steel self-locking nut.</caption>
 <figure>
 <location><page_1><loc_54><loc_11><loc_94><loc_46></location>
 <caption>Figure 7-27. Stainless steel self-locking nut.</caption>
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.doctags.txt
@ -2,8 +2,8 @@
 <subtitle-level-1><location><page_1><loc_22><loc_83><loc_52><loc_84></location>JavaScript Code Example</subtitle-level-1>
 <paragraph><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
 <paragraph><location><page_1><loc_22><loc_57><loc_78><loc_63></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</paragraph>
-<paragraph><location><page_1><loc_36><loc_55><loc_63><loc_56></location>Listing 1: Simple JavaScript Program</paragraph>
 <paragraph><location><page_1><loc_22><loc_49><loc_43><loc_54></location>function add(a, b) { return a + b; } console.log(add(3, 5));</paragraph>
+<caption><location><page_1><loc_36><loc_55><loc_63><loc_56></location>Listing 1: Simple JavaScript Program</caption>
 <paragraph><location><page_1><loc_22><loc_29><loc_78><loc_47></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
 <paragraph><location><page_1><loc_22><loc_23><loc_78><loc_29></location>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</paragraph>
 <subtitle-level-1><location><page_2><loc_22><loc_84><loc_32><loc_85></location>Formula</subtitle-level-1>
--- a/tests/data/groundtruth/docling_v1/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.md
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.md
@ -4,10 +4,10 @@ Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod

 Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,

-Listing 1: Simple JavaScript Program
-
 function add(a, b) { return a + b; } console.log(add(3, 5));

+Listing 1: Simple JavaScript Program
+
 Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.

 Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,
--- a/tests/data/groundtruth/docling_v1/picture_classification.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/picture_classification.doctags.txt
@ -1,14 +1,12 @@
 <document>
 <subtitle-level-1><location><page_1><loc_22><loc_83><loc_41><loc_84></location>Figures Example</subtitle-level-1>
 <paragraph><location><page_1><loc_22><loc_63><loc_78><loc_81></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
-<caption><location><page_1><loc_37><loc_32><loc_63><loc_33></location>Figure 1: This is an example image.</caption>
 <figure>
 <location><page_1><loc_22><loc_36><loc_78><loc_62></location>
 <caption>Figure 1: This is an example image.</caption>
 </figure>
 <paragraph><location><page_1><loc_22><loc_15><loc_78><loc_30></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.</paragraph>
 <paragraph><location><page_2><loc_22><loc_66><loc_78><loc_84></location>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</paragraph>
-<caption><location><page_2><loc_37><loc_33><loc_63><loc_34></location>Figure 2: This is an example image.</caption>
 <figure>
 <location><page_2><loc_36><loc_36><loc_64><loc_65></location>
 <caption>Figure 2: This is an example image.</caption>
--- a/tests/data/groundtruth/docling_v1/picture_classification.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
@ -7,9 +7,7 @@
 <figure>
 <location><page_1><loc_5><loc_11><loc_96><loc_63></location>
 </figure>
-<figure>
-<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
-</figure>
+<paragraph><location><page_1><loc_47><loc_94><loc_68><loc_96></location>Front cover</paragraph>
 <subtitle-level-1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</subtitle-level-1>
 <paragraph><location><page_3><loc_11><loc_89><loc_39><loc_91></location>DB2 for i Center of Excellence</paragraph>
 <paragraph><location><page_3><loc_15><loc_80><loc_38><loc_83></location>Solution Brief IBM Systems Lab Services and Training</paragraph>
@ -49,17 +47,17 @@
 <figure>
 <location><page_4><loc_23><loc_36><loc_41><loc_53></location>
 </figure>
+<paragraph><location><page_4><loc_43><loc_35><loc_88><loc_53></location>Jim Bainbridge is a senior DB2 consultant on the DB2 for i Center of Excellence team in the IBM Lab Services and Training organization. His primary role is training and implementation services for IBM DB2 Web Query for i and business analytics. Jim began his career with IBM 30 years ago in the IBM Rochester Development Lab, where he developed cooperative processing products that paired IBM PCs with IBM S/36 and AS/.400 systems. In the years since, Jim has held numerous technical roles, including independent software vendors technical support on a broad range of IBM technologies and products, and supporting customers in the IBM Executive Briefing Center and IBM Project Office.</paragraph>
 <figure>
 <location><page_4><loc_24><loc_20><loc_41><loc_33></location>
 </figure>
-<paragraph><location><page_4><loc_43><loc_35><loc_88><loc_53></location>Jim Bainbridge is a senior DB2 consultant on the DB2 for i Center of Excellence team in the IBM Lab Services and Training organization. His primary role is training and implementation services for IBM DB2 Web Query for i and business analytics. Jim began his career with IBM 30 years ago in the IBM Rochester Development Lab, where he developed cooperative processing products that paired IBM PCs with IBM S/36 and AS/.400 systems. In the years since, Jim has held numerous technical roles, including independent software vendors technical support on a broad range of IBM technologies and products, and supporting customers in the IBM Executive Briefing Center and IBM Project Office.</paragraph>
 <paragraph><location><page_4><loc_43><loc_14><loc_88><loc_33></location>Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Rochester, Minnesota. He writes extensively and teaches IBM classes worldwide in all areas of DB2 for i. Before joining STG Lab Services, he worked in the ITSO for nine years writing multiple IBM Redbooksfi publications. He also worked for IBM Colombia as an IBM AS/400fi IT Specialist doing presales support for the Andean countries. He has 28 years of experience in the computing field and has taught database classes in Colombian universities. He holds a Master's degree in Computer Science from EAFIT, Colombia. His areas of expertise are database technology, performance, and data warehousing. Hernando can be contacted at hbedoya@us.ibm.com .</paragraph>
 <subtitle-level-1><location><page_4><loc_11><loc_62><loc_20><loc_64></location>Authors</subtitle-level-1>
 <figure>
 <location><page_5><loc_5><loc_70><loc_39><loc_91></location>
 </figure>
-<paragraph><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</paragraph>
 <paragraph><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</paragraph>
+<paragraph><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</paragraph>
 <subtitle-level-1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</subtitle-level-1>
 <paragraph><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</paragraph>
 <paragraph><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</paragraph>
@ -83,7 +81,6 @@
 <paragraph><location><page_7><loc_22><loc_68><loc_88><loc_75></location>Some IBM i clients have tried augmenting the all-or-nothing object-level security with SQL views (or logical files) and application logic, as shown in Figure 1-2. However, application-based logic is easy to bypass with all of the different data access interfaces that are provided by the IBM i operating system, such as Open Database Connectivity (ODBC) and System i Navigator.</paragraph>
 <paragraph><location><page_7><loc_22><loc_60><loc_89><loc_66></location>Using SQL views to limit access to a subset of the data in a table also has its own set of challenges. First, there is the complexity of managing all of the SQL view objects that are used for securing data access. Second, scaling a view-based security solution can be difficult as the amount of data grows and the number of users increases.</paragraph>
 <paragraph><location><page_7><loc_22><loc_54><loc_89><loc_59></location>Even if you are willing to live with these performance and management issues, a user with *ALLOBJ access still can directly access all of the data in the underlying DB2 table and easily bypass the security controls that are built into an SQL view.</paragraph>
-<caption><location><page_7><loc_22><loc_12><loc_52><loc_13></location>Figure 1-2 Existing row and column controls</caption>
 <figure>
 <location><page_7><loc_22><loc_13><loc_89><loc_53></location>
 <caption>Figure 1-2 Existing row and column controls</caption>
@ -97,7 +94,6 @@
 <paragraph><location><page_8><loc_22><loc_75><loc_72><loc_76></location>CHGFCNUSG FCNID(QIBM_DB_SECADM) USER(HBEDOYA) USAGE(*ALLOWED)</paragraph>
 <subtitle-level-1><location><page_8><loc_11><loc_71><loc_89><loc_72></location>2.1.7 Verifying function usage IDs for RCAC with the FUNCTION_USAGE view</subtitle-level-1>
 <paragraph><location><page_8><loc_22><loc_66><loc_85><loc_69></location>The FUNCTION_USAGE view contains function usage configuration details. Table 2-1 describes the columns in the FUNCTION_USAGE view.</paragraph>
-<caption><location><page_8><loc_22><loc_64><loc_46><loc_65></location>Table 2-1 FUNCTION_USAGE view</caption>
 <table>
 <location><page_8><loc_22><loc_44><loc_89><loc_63></location>
 <caption>Table 2-1 FUNCTION_USAGE view</caption>
@ -107,8 +103,9 @@
 <row_3><col_0><body>USAGE</col_0><col_1><body>VARCHAR(7)</col_1><col_2><body>Usage setting: GLYPH<SM590000> ALLOWED: The user profile is allowed to use the function. GLYPH<SM590000> DENIED: The user profile is not allowed to use the function.</col_2></row_3>
 <row_4><col_0><body>USER_TYPE</col_0><col_1><body>VARCHAR(5)</col_1><col_2><body>Type of user profile: GLYPH<SM590000> USER: The user profile is a user. GLYPH<SM590000> GROUP: The user profile is a group.</col_2></row_4>
 </table>
+<caption><location><page_8><loc_22><loc_64><loc_46><loc_65></location>Table 2-1 FUNCTION_USAGE view</caption>
 <paragraph><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</paragraph>
-<paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
+<caption><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</caption>
 <paragraph><location><page_8><loc_22><loc_35><loc_28><loc_36></location>SELECT</paragraph>
 <paragraph><location><page_8><loc_30><loc_35><loc_41><loc_36></location>function_id,</paragraph>
 <paragraph><location><page_8><loc_27><loc_34><loc_39><loc_35></location>user_name,</paragraph>
@ -128,16 +125,15 @@
 <paragraph><location><page_9><loc_22><loc_65><loc_89><loc_69></location>QIBM_DB_SECADM also is responsible for administering RCAC, which restricts which rows a user is allowed to access in a table and whether a user is allowed to see information in certain columns of a table.</paragraph>
 <paragraph><location><page_9><loc_22><loc_57><loc_88><loc_63></location>A preferred practice is that the RCAC administrator has the QIBM_DB_SECADM function usage ID, but absolutely no other data privileges. The result is that the RCAC administrator can deploy and maintain the RCAC constructs, but cannot grant themselves unauthorized access to data itself.</paragraph>
 <paragraph><location><page_9><loc_22><loc_53><loc_89><loc_56></location>Table 2-2 shows a comparison of the different function usage IDs and *JOBCTL authority to the different CL commands and DB2 for i tools.</paragraph>
-<caption><location><page_9><loc_11><loc_51><loc_64><loc_52></location>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
 <table>
 <location><page_9><loc_11><loc_9><loc_89><loc_50></location>
 <caption>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
 <row_0><col_0><row_header>User action</col_0><col_1><body>*JOBCTL</col_1><col_2><body>QIBM_DB_SECADM</col_2><col_3><body>QIBM_DB_SQLADM</col_3><col_4><body>QIBM_DB_SYSMON</col_4><col_5><body>No Authority</col_5></row_0>
 <row_1><col_0><row_header>SET CURRENT DEGREE  (SQL statement)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_1>
-<row_2><col_0><row_header>CHGQRYA  command targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
-<row_3><col_0><row_header>STRDBMON  or  ENDDBMON  commands targeting a different user's job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
+<row_2><col_0><row_header>CHGQRYA  command targeting a different user’s job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_2>
+<row_3><col_0><row_header>STRDBMON  or  ENDDBMON  commands targeting a different user’s job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_3>
 <row_4><col_0><row_header>STRDBMON  or  ENDDBMON  commands targeting a job that matches the current user</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_4>
-<row_5><col_0><row_header>QUSRJOBI() API format 900 or System i Navigator's SQL Details for Job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body></col_5></row_5>
+<row_5><col_0><row_header>QUSRJOBI() API format 900 or System i Navigator’s SQL Details for Job</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body></col_5></row_5>
 <row_6><col_0><row_header>Visual Explain within Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body>X</col_4><col_5><body>X</col_5></row_6>
 <row_7><col_0><row_header>Visual Explain outside of Run SQL scripts</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_7>
 <row_8><col_0><row_header>ANALYZE PLAN CACHE procedure</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_8>
@ -146,16 +142,15 @@
 <row_11><col_0><row_header>MODIFY PLAN CACHE PROPERTIES procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_11>
 <row_12><col_0><row_header>CHANGE PLAN CACHE SIZE procedure (currently does not check authority)</col_0><col_1><body>X</col_1><col_2><body></col_2><col_3><body>X</col_3><col_4><body></col_4><col_5><body></col_5></row_12>
 </table>
+<caption><location><page_9><loc_11><loc_51><loc_64><loc_52></location>Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority</caption>
 <caption><location><page_10><loc_22><loc_88><loc_86><loc_91></location>The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.</caption>
-<caption><location><page_10><loc_22><loc_47><loc_56><loc_48></location>Figure 3-1 CREATE PERMISSION SQL statement</caption>
 <figure>
 <location><page_10><loc_22><loc_48><loc_89><loc_86></location>
-<caption>The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.Figure 3-1 CREATE PERMISSION SQL statement</caption>
+<caption>Figure 3-1 CREATE PERMISSION SQL statement</caption>
 </figure>
 <subtitle-level-1><location><page_10><loc_22><loc_43><loc_35><loc_44></location>Column mask</subtitle-level-1>
 <paragraph><location><page_10><loc_22><loc_37><loc_89><loc_43></location>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</paragraph>
-<paragraph><location><page_11><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</paragraph>
-<caption><location><page_11><loc_22><loc_87><loc_61><loc_88></location>Table 3-1 Special registers and their corresponding values</caption>
+<caption><location><page_11><loc_22><loc_90><loc_67><loc_91></location>Table 3-1 summarizes these special registers and their values.</caption>
 <table>
 <location><page_11><loc_22><loc_74><loc_89><loc_87></location>
 <caption>Table 3-1 Special registers and their corresponding values</caption>
@ -164,13 +159,13 @@
 <row_2><col_0><body>CURRENT_USER</col_0><col_1><body>The effective user of the thread including adopted authority. When no adopted  authority is present, this has the same value as USER.</col_1></row_2>
 <row_3><col_0><body>SYSTEM_USER</col_0><col_1><body>The authorization ID that initiated the connection.</col_1></row_3>
 </table>
+<caption><location><page_11><loc_22><loc_87><loc_61><loc_88></location>Table 3-1 Special registers and their corresponding values</caption>
 <paragraph><location><page_11><loc_22><loc_70><loc_88><loc_73></location>Figure 3-5 shows the difference in the special register values when an adopted authority is used:</paragraph>
 <paragraph><location><page_11><loc_22><loc_68><loc_67><loc_69></location>- GLYPH<SM590000> A user connects to the server using the user profile ALICE.</paragraph>
 <paragraph><location><page_11><loc_22><loc_66><loc_74><loc_67></location>- GLYPH<SM590000> USER and CURRENT USER initially have the same value of ALICE.</paragraph>
 <paragraph><location><page_11><loc_22><loc_62><loc_88><loc_65></location>- GLYPH<SM590000> ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called.</paragraph>
 <paragraph><location><page_11><loc_22><loc_57><loc_89><loc_61></location>- GLYPH<SM590000> While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority.</paragraph>
 <paragraph><location><page_11><loc_22><loc_53><loc_89><loc_56></location>- GLYPH<SM590000> When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE.</paragraph>
-<caption><location><page_11><loc_22><loc_24><loc_56><loc_25></location>Figure 3-5 Special registers and adopted authority</caption>
 <figure>
 <location><page_11><loc_22><loc_25><loc_49><loc_51></location>
 <caption>Figure 3-5 Special registers and adopted authority</caption>
@ -179,7 +174,6 @@
 <paragraph><location><page_11><loc_22><loc_15><loc_85><loc_18></location>Built-in global variables are provided with the database manager and are used in SQL statements to retrieve scalar values that are associated with the variables.</paragraph>
 <paragraph><location><page_11><loc_22><loc_9><loc_87><loc_13></location>IBM DB2 for i supports nine different built-in global variables that are read only and maintained by the system. These global variables can be used to identify attributes of the database connection and used as part of the RCAC logic.</paragraph>
 <paragraph><location><page_12><loc_22><loc_90><loc_56><loc_91></location>Table 3-2 lists the nine built-in global variables.</paragraph>
-<caption><location><page_12><loc_11><loc_87><loc_33><loc_88></location>Table 3-2 Built-in global variables</caption>
 <table>
 <location><page_12><loc_10><loc_63><loc_90><loc_87></location>
 <caption>Table 3-2 Built-in global variables</caption>
@ -194,6 +188,7 @@
 <row_8><col_0><body>ROUTINE_SPECIFIC_NAME</col_0><col_1><body>VARCHAR(128)</col_1><col_2><body>Name of the currently running routine</col_2></row_8>
 <row_9><col_0><body>ROUTINE_TYPE</col_0><col_1><body>CHAR(1)</col_1><col_2><body>Type of the currently running routine</col_2></row_9>
 </table>
+<caption><location><page_12><loc_11><loc_87><loc_33><loc_88></location>Table 3-2 Built-in global variables</caption>
 <subtitle-level-1><location><page_12><loc_11><loc_57><loc_63><loc_59></location>3.3 VERIFY_GROUP_FOR_USER function</subtitle-level-1>
 <paragraph><location><page_12><loc_22><loc_45><loc_89><loc_55></location>The VERIFY_GROUP_FOR_USER function was added in IBM i 7.2. Although it is primarily intended for use with RCAC permissions and masks, it can be used in other SQL statements. The first parameter must be one of these three special registers: SESSION_USER, USER, or CURRENT_USER. The second and subsequent parameters are a list of user or group profiles. Each of these values must be 1 - 10 characters in length. These values are not validated for their existence, which means that you can specify the names of user profiles that do not exist without receiving any kind of error.</paragraph>
 <paragraph><location><page_12><loc_22><loc_39><loc_89><loc_43></location>If a special register value is in the list of user profiles or it is a member of a group profile included in the list, the function returns a long integer value of 1. Otherwise, it returns a value of 0. It never returns the null value.</paragraph>
@ -211,10 +206,9 @@
 <paragraph><location><page_13><loc_25><loc_55><loc_89><loc_57></location>- -Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234).</paragraph>
 <paragraph><location><page_13><loc_25><loc_52><loc_87><loc_54></location>- -Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX.</paragraph>
 <paragraph><location><page_13><loc_25><loc_50><loc_87><loc_51></location>- To implement this column mask, run the SQL statement that is shown in Example 3-9.</paragraph>
-<paragraph><location><page_13><loc_22><loc_48><loc_58><loc_49></location>Example 3-9 Creating a mask on the TAX_ID column</paragraph>
 <paragraph><location><page_13><loc_22><loc_14><loc_86><loc_47></location>CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ;</paragraph>
+<caption><location><page_13><loc_22><loc_48><loc_58><loc_49></location>Example 3-9 Creating a mask on the TAX_ID column</caption>
 <paragraph><location><page_14><loc_22><loc_90><loc_74><loc_91></location>- 3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.</paragraph>
-<caption><location><page_14><loc_11><loc_77><loc_48><loc_78></location>Figure 3-10 Column masks shown in System i Navigator</caption>
 <figure>
 <location><page_14><loc_10><loc_79><loc_89><loc_88></location>
 <caption>Figure 3-10 Column masks shown in System i Navigator</caption>
@ -230,19 +224,16 @@
 <paragraph><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</paragraph>
 <paragraph><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
 <paragraph><location><page_14><loc_22><loc_48><loc_88><loc_52></location>- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</paragraph>
-<caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
 <figure>
 <location><page_14><loc_10><loc_18><loc_87><loc_46></location>
 <caption>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
 </figure>
 <paragraph><location><page_15><loc_22><loc_87><loc_84><loc_91></location>- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</paragraph>
-<caption><location><page_15><loc_22><loc_38><loc_53><loc_39></location>Figure 4-68 Visual Explain with RCAC enabled</caption>
+<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
 <figure>
 <location><page_15><loc_22><loc_40><loc_89><loc_85></location>
 <caption>Figure 4-68 Visual Explain with RCAC enabled</caption>
 </figure>
-<paragraph><location><page_15><loc_22><loc_32><loc_89><loc_36></location>- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</paragraph>
-<caption><location><page_15><loc_11><loc_15><loc_37><loc_16></location>Figure 4-69 Index advice with no RCAC</caption>
 <figure>
 <location><page_15><loc_11><loc_16><loc_83><loc_30></location>
 <caption>Figure 4-69 Index advice with no RCAC</caption>
@ -252,8 +243,8 @@
 <subtitle-level-1><location><page_18><loc_4><loc_82><loc_73><loc_91></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
 <paragraph><location><page_18><loc_4><loc_66><loc_21><loc_69></location>Implement roles and separation of duties</paragraph>
 <paragraph><location><page_18><loc_4><loc_59><loc_20><loc_64></location>Leverage row permissions on the database</paragraph>
-<paragraph><location><page_18><loc_4><loc_52><loc_20><loc_57></location>Protect columns by defining column masks</paragraph>
 <paragraph><location><page_18><loc_25><loc_59><loc_68><loc_69></location>This IBM Redpaper publication provides information about the IBM i 7.2 feature of IBM DB2 for i Row and Column Access Control (RCAC). It offers a broad description of the function and advantages of controlling access to data in a comprehensive and transparent way. This publication helps you understand the capabilities of RCAC and provides examples of defining, creating, and implementing the row permissions and column masks in a relational database environment.</paragraph>
+<paragraph><location><page_18><loc_4><loc_52><loc_20><loc_57></location>Protect columns by defining column masks</paragraph>
 <paragraph><location><page_18><loc_25><loc_51><loc_68><loc_58></location>This paper is intended for database engineers, data-centric application developers, and security officers who want to design and implement RCAC as a part of their data control and governance policy. A solid background in IBM i object level security, DB2 for i relational database concepts, and SQL is assumed.</paragraph>
 <figure>
 <location><page_18><loc_79><loc_93><loc_93><loc_97></location>
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.md
@ -6,7 +6,7 @@ Front cover

 <!-- image -->

-<!-- image -->
+Front cover

 ## Contents

@ -74,20 +74,20 @@ This paper was produced by the IBM DB2 for i Center of Excellence team in partne

 <!-- image -->

-<!-- image -->
-
 Jim Bainbridge is a senior DB2 consultant on the DB2 for i Center of Excellence team in the IBM Lab Services and Training organization. His primary role is training and implementation services for IBM DB2 Web Query for i and business analytics. Jim began his career with IBM 30 years ago in the IBM Rochester Development Lab, where he developed cooperative processing products that paired IBM PCs with IBM S/36 and AS/.400 systems. In the years since, Jim has held numerous technical roles, including independent software vendors technical support on a broad range of IBM technologies and products, and supporting customers in the IBM Executive Briefing Center and IBM Project Office.

+<!-- image -->
+
 Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Rochester, Minnesota. He writes extensively and teaches IBM classes worldwide in all areas of DB2 for i. Before joining STG Lab Services, he worked in the ITSO for nine years writing multiple IBM Redbooksfi publications. He also worked for IBM Colombia as an IBM AS/400fi IT Specialist doing presales support for the Andean countries. He has 28 years of experience in the computing field and has taught database classes in Colombian universities. He holds a Master's degree in Computer Science from EAFIT, Colombia. His areas of expertise are database technology, performance, and data warehousing. Hernando can be contacted at hbedoya@us.ibm.com .

 ## Authors

 <!-- image -->

-Chapter 1.
-
 1

+Chapter 1.
+
 ## Securing and protecting IBM DB2 data

 Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.
@ -211,10 +211,10 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
 | User action                                                                    | *JOBCTL   | QIBM_DB_SECADM   | QIBM_DB_SQLADM   | QIBM_DB_SYSMON   | No Authority   |
 |--------------------------------------------------------------------------------|-----------|------------------|------------------|------------------|----------------|
 | SET CURRENT DEGREE  (SQL statement)                                            | X         |                  | X                |                  |                |
-| CHGQRYA  command targeting a different user's job                              | X         |                  | X                |                  |                |
-| STRDBMON  or  ENDDBMON  commands targeting a different user's job              | X         |                  | X                |                  |                |
+| CHGQRYA  command targeting a different user’s job                              | X         |                  | X                |                  |                |
+| STRDBMON  or  ENDDBMON  commands targeting a different user’s job              | X         |                  | X                |                  |                |
 | STRDBMON  or  ENDDBMON  commands targeting a job that matches the current user | X         |                  | X                | X                | X              |
-| QUSRJOBI() API format 900 or System i Navigator's SQL Details for Job          | X         |                  | X                | X                |                |
+| QUSRJOBI() API format 900 or System i Navigator’s SQL Details for Job          | X         |                  | X                | X                |                |
 | Visual Explain within Run SQL scripts                                          | X         |                  | X                | X                | X              |
 | Visual Explain outside of Run SQL scripts                                      | X         |                  | X                |                  |                |
 | ANALYZE PLAN CACHE procedure                                                   | X         |                  | X                |                  |                |
@ -226,8 +226,6 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
 The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.

 Figure 3-1 CREATE PERMISSION SQL statement
-
-The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.Figure 3-1 CREATE PERMISSION SQL statement
 <!-- image -->

 ## Column mask
@ -315,10 +313,10 @@ WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . D

 - To implement this column mask, run the SQL statement that is shown in Example 3-9.

-Example 3-9 Creating a mask on the TAX_ID column
-
 CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ;

+Example 3-9 Creating a mask on the TAX_ID column
+
 - 3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.

 Figure 3-10 Column masks shown in System i Navigator
@ -351,11 +349,11 @@ Figure 3-11 Selecting the EMPLOYEES table from System i Navigator

 - 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.

+- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.
+
 Figure 4-68 Visual Explain with RCAC enabled
 <!-- image -->

- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.
-
 Figure 4-69 Index advice with no RCAC
 <!-- image -->

@ -369,10 +367,10 @@ Implement roles and separation of duties

 Leverage row permissions on the database

-Protect columns by defining column masks
-
 This IBM Redpaper publication provides information about the IBM i 7.2 feature of IBM DB2 for i Row and Column Access Control (RCAC). It offers a broad description of the function and advantages of controlling access to data in a comprehensive and transparent way. This publication helps you understand the capabilities of RCAC and provides examples of defining, creating, and implementing the row permissions and column masks in a relational database environment.

+Protect columns by defining column masks
+
 This paper is intended for database engineers, data-centric application developers, and security officers who want to design and implement RCAC as a part of their data control and governance policy. A solid background in IBM i object level security, DB2 for i relational database concepts, and SQL is assumed.

 <!-- image -->
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.doctags.txt
@ -2,8 +2,8 @@
 <subtitle-level-1><location><page_1><loc_37><loc_89><loc_85><loc_91></location>Pythonو R ةغلب ةجمربلا للاخ نم تلاكشملا لحو ةيجاتنلإا نيسحت</subtitle-level-1>
 <paragraph><location><page_1><loc_15><loc_80><loc_85><loc_87></location>Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ يف دعاستو ةيجاتنلإا ززعت نأ نكمي يتلا ةيوقلا تاودلأا نم ءاملعلاو نيللحملا ىلع لهسي امم ،تانايبلا ليلحتل ةيلاثم اهلعجت ةديرف تازيمPython و R نم لك كلتمي .تلاكشملل ناك اذإ .ةلاعفو ةعيرس ةقيرطب ةدقعم تلايلحت ءارجإ مهسي نأ نكمي تاغللا هذه مادختسا نإف ،ةيليلحت ةيلقع كيدل .لمعلا جئاتن نيسحت يف ريبك لكشب</paragraph>
 <paragraph><location><page_1><loc_34><loc_73><loc_34><loc_75></location>ً</paragraph>
-<paragraph><location><page_1><loc_16><loc_71><loc_85><loc_78></location>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</paragraph>
 <paragraph><location><page_1><loc_83><loc_71><loc_83><loc_73></location>ً</paragraph>
+<paragraph><location><page_1><loc_16><loc_71><loc_85><loc_78></location>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</paragraph>
 <paragraph><location><page_1><loc_15><loc_63><loc_85><loc_70></location>ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا</paragraph>
 <paragraph><location><page_1><loc_16><loc_56><loc_85><loc_61></location>Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت</paragraph>
 </document>
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.md
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.md
@ -6,8 +6,6 @@ Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ ي

 جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .

-ً
-
 ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا

 Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
@ -4,17 +4,17 @@
 <text><loc_170><loc_111><loc_309><loc_116>{ ahn,nli,mly,taa } @zurich.ibm.com</text>
 <section_header_level_1><loc_119><loc_136><loc_156><loc_143>Abstract</section_header_level_1>
 <section_header_level_1><loc_258><loc_138><loc_334><loc_143>a. Picture of a table:</section_header_level_1>
+<text><loc_41><loc_152><loc_234><loc_324>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</text>
 <section_header_level_1><loc_41><loc_341><loc_104><loc_348>1. Introduction</section_header_level_1>
 <text><loc_41><loc_354><loc_234><loc_450>The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.</text>
 <picture><loc_258><loc_144><loc_439><loc_191></picture>
-<otsl><loc_258><loc_144><loc_439><loc_191><ched>3<ched>1<nl><caption><loc_41><loc_152><loc_234><loc_324>Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.</caption></otsl>
+<otsl><loc_258><loc_144><loc_439><loc_191><ched>3<ched>1<nl></otsl>
 <unordered_list><list_item><loc_258><loc_198><loc_397><loc_210>b. Red-annotation of bounding boxes, Blue-predictions by TableFormer</list_item>
+<list_item><loc_258><loc_265><loc_401><loc_271>c. Structure predicted by TableFormer:</list_item>
 </unordered_list>
 <picture><loc_257><loc_213><loc_441><loc_259></picture>
-<unordered_list><list_item><loc_258><loc_265><loc_401><loc_271>c. Structure predicted by TableFormer:</list_item>
-</unordered_list>
-<picture><loc_258><loc_274><loc_439><loc_313></picture>
-<otsl><loc_258><loc_274><loc_439><loc_313><ched>0<ched>1<lcel><ched>2 1<lcel><ecel><nl><fcel>3<fcel>4<fcel>5 3<fcel>6<fcel>7<ecel><nl><fcel>8<fcel>9<fcel>10<fcel>11<fcel>12<fcel>2<nl><ecel><fcel>13<fcel>14<fcel>15<fcel>16<ucel><nl><ecel><fcel>17<fcel>18<fcel>19<fcel>20<ucel><nl><caption><loc_252><loc_325><loc_445><loc_353>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption></otsl>
+<picture><loc_258><loc_274><loc_439><loc_313><caption><loc_252><loc_325><loc_445><loc_353>Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.</caption></picture>
+<otsl><loc_258><loc_274><loc_439><loc_313><ched>0<ched>1<lcel><ched>2 1<lcel><ecel><nl><fcel>3<fcel>4<fcel>5 3<fcel>6<fcel>7<ecel><nl><fcel>8<fcel>9<fcel>10<fcel>11<fcel>12<fcel>2<nl><ecel><fcel>13<fcel>14<fcel>15<fcel>16<ucel><nl><ecel><fcel>17<fcel>18<fcel>19<fcel>20<ucel><nl></otsl>
 <text><loc_252><loc_369><loc_445><loc_420>Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.</text>
 <text><loc_252><loc_422><loc_445><loc_450>The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be</text>
 <page_footer><loc_241><loc_463><loc_245><loc_469>1</page_footer>
@ -30,12 +30,12 @@
 </unordered_list>
 <text><loc_41><loc_411><loc_234><loc_439>The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe</text>
 <footnote><loc_50><loc_445><loc_150><loc_450>$^{1}$https://github.com/IBM/SynthTabNet</footnote>
-<page_footer><loc_241><loc_463><loc_245><loc_469>2</page_footer>
 <text><loc_252><loc_47><loc_445><loc_68>its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.</text>
 <section_header_level_1><loc_252><loc_77><loc_407><loc_84>2. Previous work and State of the Art</section_header_level_1>
 <text><loc_252><loc_90><loc_445><loc_209>Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc.</text>
 <text><loc_252><loc_211><loc_445><loc_284>Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best-of-our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art tablestructure identification.</text>
 <text><loc_252><loc_286><loc_445><loc_450>Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are "image-encoder → text-decoder" (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the "image-encoder → dual decoder" (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>2</page_footer>
 <page_break>
 <text><loc_41><loc_47><loc_204><loc_53>tag-decoder which is constrained to the table-tags.</text>
 <text><loc_41><loc_55><loc_234><loc_174>In practice, both network architectures (IETD and IEDD) require an implicit, custom trained object-characterrecognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-english content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper.</text>
@ -43,18 +43,17 @@
 <text><loc_41><loc_312><loc_234><loc_393>Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.</text>
 <section_header_level_1><loc_41><loc_401><loc_86><loc_408>3. Datasets</section_header_level_1>
 <text><loc_41><loc_414><loc_234><loc_450>We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-</text>
-<page_footer><loc_241><loc_463><loc_245><loc_469>3</page_footer>
 <picture><loc_255><loc_50><loc_450><loc_158><caption><loc_252><loc_169><loc_445><loc_182>Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets</caption></picture>
 <text><loc_252><loc_200><loc_357><loc_206>balance in the previous datasets.</text>
 <text><loc_252><loc_209><loc_445><loc_396>The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as "simple" when it does not contain row spans or column spans, otherwise it is "complex". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.</text>
 <text><loc_252><loc_399><loc_445><loc_450>Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>3</page_footer>
 <page_break>
 <text><loc_41><loc_47><loc_234><loc_61>amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns).</text>
 <text><loc_41><loc_64><loc_234><loc_198>The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
 <text><loc_41><loc_201><loc_234><loc_274>As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.</text>
 <text><loc_41><loc_277><loc_234><loc_396>Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.</text>
 <text><loc_41><loc_399><loc_234><loc_450>In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third</text>
-<page_footer><loc_241><loc_463><loc_245><loc_469>4</page_footer>
 <otsl><loc_254><loc_46><loc_444><loc_98><ecel><ched>Tags<ched>Bbox<ched>Size<ched>Format<nl><rhed>PubTabNet<fcel>3<fcel>3<fcel>509k<fcel>PNG<nl><rhed>FinTabNet<fcel>3<fcel>3<fcel>112k<fcel>PDF<nl><rhed>TableBank<fcel>3<fcel>7<fcel>145k<fcel>JPEG<nl><rhed>Combined-Tabnet(*)<fcel>3<fcel>3<fcel>400k<fcel>PNG<nl><rhed>Combined(**)<fcel>3<fcel>3<fcel>500k<fcel>PNG<nl><rhed>SynthTabNet<fcel>3<fcel>3<fcel>600k<fcel>PNG<nl><caption><loc_252><loc_106><loc_445><loc_142>Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.</caption></otsl>
 <text><loc_252><loc_158><loc_445><loc_186>one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.</text>
 <text><loc_262><loc_188><loc_443><loc_194>Tab. 1 summarizes the various attributes of the datasets.</text>
@ -63,6 +62,7 @@
 <section_header_level_1><loc_252><loc_289><loc_343><loc_295>4.1. Model architecture.</section_header_level_1>
 <text><loc_252><loc_301><loc_445><loc_420>We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.</text>
 <text><loc_252><loc_422><loc_445><loc_450>CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>4</page_footer>
 <page_break>
 <picture><loc_61><loc_49><loc_425><loc_116><caption><loc_41><loc_129><loc_445><loc_142>Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.</caption></picture>
 <picture><loc_43><loc_163><loc_233><loc_320><caption><loc_41><loc_333><loc_234><loc_429>Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.</caption></picture>
@ -83,15 +83,14 @@
 <section_header_level_1><loc_41><loc_364><loc_146><loc_370>5.1. Implementation Details</section_header_level_1>
 <text><loc_41><loc_376><loc_234><loc_404>TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:</text>
 <formula><loc_75><loc_413><loc_234><loc_428></formula>
-<text><loc_41><loc_437><loc_234><loc_450>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved</text>
-<page_footer><loc_241><loc_463><loc_245><loc_469>6</page_footer>
-<text><loc_252><loc_47><loc_445><loc_68>runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</text>
+<text><loc_41><loc_437><loc_234><loc_450><loc_41><loc_437><loc_234><loc_450>Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.</text>
 <text><loc_252><loc_73><loc_445><loc_207>The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.</text>
 <text><loc_252><loc_212><loc_445><loc_271>For training, TableFormer is trained with 3 Adam optimizers, each one for the CNN Backbone Network , Structure Decoder , and Cell BBox Decoder . Taking the PubTabNet as an example for our parameter set up, the initializing learning rate is 0.001 for 12 epochs with a batch size of 24, and λ set to 0.5. Afterwards, we reduce the learning rate to 0.0001, the batch size to 18 and train for 12 more epochs or convergence.</text>
 <text><loc_252><loc_276><loc_445><loc_350>TableFormer is implemented with PyTorch and Torchvision libraries [22]. To speed up the inference, the image undergoes a single forward pass through the CNN Backbone Network and transformer encoder. This eliminates the overhead of generating the same features for each decoding step. Similarly, we employ a 'caching' technique to preform faster autoregressive decoding. This is achieved by storing the features of decoded tokens so we can reuse them for each time step. Therefore, we only compute the attention for each new tag.</text>
 <section_header_level_1><loc_252><loc_366><loc_325><loc_372>5.2. Generalization</section_header_level_1>
 <text><loc_252><loc_381><loc_445><loc_424>TableFormer is evaluated on three major publicly available datasets of different nature to prove the generalization and effectiveness of our model. The datasets used for evaluation are the PubTabNet, FinTabNet and TableBank which stem from the scientific, financial and general domains respectively.</text>
 <text><loc_252><loc_430><loc_445><loc_450>We also share our baseline results on the challenging SynthTabNet dataset. Throughout our experiments, the same parameters stated in Sec. 5.1 are utilized.</text>
+<page_footer><loc_241><loc_463><loc_245><loc_469>6</page_footer>
 <page_break>
 <section_header_level_1><loc_41><loc_47><loc_137><loc_53>5.3. Datasets and Metrics</section_header_level_1>
 <text><loc_41><loc_59><loc_234><loc_87>The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as:</text>
@ -99,14 +98,14 @@
 <text><loc_41><loc_114><loc_234><loc_135>where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .</text>
 <section_header_level_1><loc_41><loc_142><loc_139><loc_148>5.4. Quantitative Analysis</section_header_level_1>
 <text><loc_41><loc_154><loc_234><loc_250>Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.</text>
-<otsl><loc_44><loc_258><loc_231><loc_368><ched>Model<ched>Dataset<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>EDD<fcel>PTN<fcel>91.1<fcel>88.7<fcel>89.9<nl><rhed>GTE<fcel>PTN<fcel>-<fcel>-<fcel>93.01<nl><rhed>TableFormer<fcel>PTN<fcel>98.5<fcel>95.0<fcel>96.75<nl><rhed>EDD<fcel>FTN<fcel>88.4<fcel>92.08<fcel>90.6<nl><rhed>GTE<fcel>FTN<fcel>-<fcel>-<fcel>87.14<nl><rhed>GTE (FT)<fcel>FTN<fcel>-<fcel>-<fcel>91.02<nl><rhed>TableFormer<fcel>FTN<fcel>97.5<fcel>96.0<fcel>96.8<nl><rhed>EDD<fcel>TB<fcel>86.0<fcel>-<fcel>86.0<nl><rhed>TableFormer<fcel>TB<fcel>89.6<fcel>-<fcel>89.6<nl><rhed>TableFormer<fcel>STN<fcel>96.9<fcel>95.7<fcel>96.7<nl><caption><loc_41><loc_374><loc_234><loc_387>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</caption></otsl>
+<otsl><loc_44><loc_258><loc_231><loc_368><ched>Model<ched>Dataset<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>EDD<fcel>PTN<fcel>91.1<fcel>88.7<fcel>89.9<nl><rhed>GTE<fcel>PTN<fcel>-<fcel>-<fcel>93.01<nl><rhed>TableFormer<fcel>PTN<fcel>98.5<fcel>95.0<fcel>96.75<nl><rhed>EDD<fcel>FTN<fcel>88.4<fcel>92.08<fcel>90.6<nl><rhed>GTE<fcel>FTN<fcel>-<fcel>-<fcel>87.14<nl><rhed>GTE (FT)<fcel>FTN<fcel>-<fcel>-<fcel>91.02<nl><rhed>TableFormer<fcel>FTN<fcel>97.5<fcel>96.0<fcel>96.8<nl><rhed>EDD<fcel>TB<fcel>86.0<fcel>-<fcel>86.0<nl><rhed>TableFormer<fcel>TB<fcel>89.6<fcel>-<fcel>89.6<nl><rhed>TableFormer<fcel>STN<fcel>96.9<fcel>95.7<fcel>96.7<nl></otsl>
+<text><loc_41><loc_374><loc_234><loc_387>Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).</text>
 <text><loc_41><loc_389><loc_214><loc_395>FT: Model was trained on PubTabNet then finetuned.</text>
-<text><loc_41><loc_407><loc_234><loc_450>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate</text>
-<page_footer><loc_241><loc_463><loc_245><loc_469>7</page_footer>
-<text><loc_252><loc_47><loc_445><loc_144>our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</text>
+<text><loc_41><loc_407><loc_234><loc_450><loc_41><loc_407><loc_234><loc_450>Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.</text>
 <otsl><loc_252><loc_156><loc_436><loc_192><ched>Model<ched>Dataset<ched>mAP<ched>mAP (PP)<nl><fcel>EDD+BBox<fcel>PubTabNet<fcel>79.2<fcel>82.7<nl><fcel>TableFormer<fcel>PubTabNet<fcel>82.1<fcel>86.8<nl><fcel>TableFormer<fcel>SynthTabNet<fcel>87.7<fcel>-<nl><caption><loc_252><loc_200><loc_445><loc_213>Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.</caption></otsl>
 <text><loc_252><loc_232><loc_445><loc_328>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</text>
 <otsl><loc_272><loc_341><loc_426><loc_406><fcel>Model<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>Tabula<fcel>78.0<fcel>57.8<fcel>67.9<nl><rhed>Traprange<fcel>60.8<fcel>49.9<fcel>55.4<nl><rhed>Camelot<fcel>80.0<fcel>66.0<fcel>73.0<nl><rhed>Acrobat Pro<fcel>68.9<fcel>61.8<fcel>65.3<nl><rhed>EDD<fcel>91.2<fcel>85.4<fcel>88.3<nl><rhed>TableFormer<fcel>95.4<fcel>90.1<fcel>93.6<nl><caption><loc_252><loc_415><loc_445><loc_435>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption></otsl>
+<page_footer><loc_241><loc_463><loc_245><loc_469>7</page_footer>
 <unordered_list><page_break>
 <list_item><loc_44><loc_50><loc_50><loc_55>a.</list_item>
 <list_item><loc_54><loc_50><loc_408><loc_55>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
@ -115,14 +114,14 @@
 <section_header_level_1><loc_249><loc_60><loc_352><loc_64>Example table from FinTabNet:</section_header_level_1>
 <picture><loc_41><loc_65><loc_246><loc_118></picture>
 <picture><loc_250><loc_62><loc_453><loc_114><caption><loc_44><loc_131><loc_315><loc_136>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption></picture>
-<otsl><loc_44><loc_138><loc_244><loc_185><ecel><ecel><ched>論文ファイル<lcel><ched>参考文献<lcel><nl><ched>出典<ched>ファイル 数<ched>英語<ched>日本語<ched>英語<ched>日本語<nl><rhed>Association for Computational Linguistics(ACL2003)<fcel>65<fcel>65<fcel>0<fcel>150<fcel>0<nl><rhed>Computational Linguistics(COLING2002)<fcel>140<fcel>140<fcel>0<fcel>150<fcel>0<nl><rhed>電気情報通信学会 2003 年総合大会<fcel>150<fcel>8<fcel>142<fcel>223<fcel>147<nl><rhed>情報処理学会第 65 回全国大会 (2003)<fcel>177<fcel>1<fcel>176<fcel>150<fcel>236<nl><rhed>第 17 回人工知能学会全国大会 (2003)<fcel>208<fcel>5<fcel>203<fcel>152<fcel>244<nl><rhed>自然言語処理研究会第 146 〜 155 回<fcel>98<fcel>2<fcel>96<fcel>150<fcel>232<nl><rhed>WWW から収集した論文<fcel>107<fcel>73<fcel>34<fcel>147<fcel>96<nl><ecel><fcel>945<fcel>294<fcel>651<fcel>1122<fcel>955<nl></otsl>
-<otsl><loc_249><loc_138><loc_450><loc_182><ecel><ched>Shares (in millions)<lcel><ched>Weighted Average Grant Date Fair Value<lcel><nl><ecel><ched>RS U s<ched>PSUs<ched>RSUs<ched>PSUs<nl><rhed>Nonvested on Janua ry 1<fcel>1. 1<fcel>0.3<fcel>90.10 $<fcel>$ 91.19<nl><rhed>Granted<fcel>0. 5<fcel>0.1<fcel>117.44<fcel>122.41<nl><rhed>Vested<fcel>(0. 5 )<fcel>(0.1)<fcel>87.08<fcel>81.14<nl><rhed>Canceled or forfeited<fcel>(0. 1 )<fcel>-<fcel>102.01<fcel>92.18<nl><rhed>Nonvested on December 31<fcel>1.0<fcel>0.3<fcel>104.85 $<fcel>$ 104.51<nl><caption><loc_311><loc_185><loc_449><loc_189>Text is aligned to match original for ease of viewing</caption></otsl>
-<picture><loc_42><loc_240><loc_173><loc_280><caption><loc_41><loc_203><loc_445><loc_231>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption></picture>
+<otsl><loc_44><loc_138><loc_244><loc_185><ecel><ecel><ched>論文ファイル<lcel><ched>参考文献<lcel><nl><ched>出典<ched>ファイル 数<ched>英語<ched>日本語<ched>英語<ched>日本語<nl><rhed>Association for Computational Linguistics(ACL2003)<fcel>65<fcel>65<fcel>0<fcel>150<fcel>0<nl><rhed>Computational Linguistics(COLING2002)<fcel>140<fcel>140<fcel>0<fcel>150<fcel>0<nl><rhed>電気情報通信学会 2003 年総合大会<fcel>150<fcel>8<fcel>142<fcel>223<fcel>147<nl><rhed>情報処理学会第 65 回全国大会 (2003)<fcel>177<fcel>1<fcel>176<fcel>150<fcel>236<nl><rhed>第 17 回人工知能学会全国大会 (2003)<fcel>208<fcel>5<fcel>203<fcel>152<fcel>244<nl><rhed>自然言語処理研究会第 146 〜 155 回<fcel>98<fcel>2<fcel>96<fcel>150<fcel>232<nl><rhed>WWW から収集した論文<fcel>107<fcel>73<fcel>34<fcel>147<fcel>96<nl><ecel><fcel>945<fcel>294<fcel>651<fcel>1122<fcel>955<nl><caption><loc_311><loc_185><loc_449><loc_189>Text is aligned to match original for ease of viewing</caption></otsl>
+<otsl><loc_249><loc_138><loc_450><loc_182><ecel><ched>Shares (in millions)<lcel><ched>Weighted Average Grant Date Fair Value<lcel><nl><ecel><ched>RS U s<ched>PSUs<ched>RSUs<ched>PSUs<nl><rhed>Nonvested on Janua ry 1<fcel>1. 1<fcel>0.3<fcel>90.10 $<fcel>$ 91.19<nl><rhed>Granted<fcel>0. 5<fcel>0.1<fcel>117.44<fcel>122.41<nl><rhed>Vested<fcel>(0. 5 )<fcel>(0.1)<fcel>87.08<fcel>81.14<nl><rhed>Canceled or forfeited<fcel>(0. 1 )<fcel>-<fcel>102.01<fcel>92.18<nl><rhed>Nonvested on December 31<fcel>1.0<fcel>0.3<fcel>104.85 $<fcel>$ 104.51<nl></otsl>
+<picture><loc_42><loc_240><loc_173><loc_280><caption><loc_51><loc_290><loc_435><loc_295>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption></picture>
+<picture><loc_177><loc_240><loc_307><loc_280><caption><loc_41><loc_203><loc_445><loc_231>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption></picture>
 <picture><loc_313><loc_241><loc_443><loc_280></picture>
-<picture><loc_177><loc_240><loc_307><loc_280><caption><loc_51><loc_290><loc_435><loc_295>Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.</caption></picture>
 <section_header_level_1><loc_41><loc_310><loc_134><loc_316>5.5. Qualitative Analysis</section_header_level_1>
-<text><loc_41><loc_339><loc_234><loc_450>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</text>
 <section_header_level_1><loc_252><loc_310><loc_377><loc_317>6. Future Work & Conclusion</section_header_level_1>
+<text><loc_41><loc_339><loc_234><loc_450>We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.</text>
 <text><loc_252><loc_324><loc_445><loc_412>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</text>
 <section_header_level_1><loc_252><loc_424><loc_298><loc_431>References</section_header_level_1>
 <unordered_list><list_item><loc_256><loc_438><loc_445><loc_450>[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</list_item>
@ -132,31 +131,30 @@
 <list_item><loc_57><loc_48><loc_234><loc_74>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</list_item>
 <list_item><loc_45><loc_76><loc_234><loc_95>[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</list_item>
 <list_item><loc_45><loc_97><loc_234><loc_116>[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</list_item>
-<list_item><loc_45><loc_118><loc_234><loc_143>[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</list_item>
+<list_item><loc_45><loc_118><loc_234><loc_143>[4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</list_item>
 <list_item><loc_45><loc_146><loc_234><loc_171>[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</list_item>
-<list_item><loc_45><loc_173><loc_234><loc_199>[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</list_item>
+<list_item><loc_45><loc_173><loc_234><loc_199>[6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</list_item>
 <list_item><loc_45><loc_201><loc_234><loc_220>[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</list_item>
 <list_item><loc_45><loc_222><loc_234><loc_255>[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</list_item>
 <list_item><loc_45><loc_257><loc_234><loc_276>[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</list_item>
 <list_item><loc_41><loc_278><loc_234><loc_304>[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</list_item>
 <list_item><loc_41><loc_306><loc_234><loc_339>[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</list_item>
 <list_item><loc_41><loc_341><loc_234><loc_373>[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</list_item>
-<list_item><loc_41><loc_375><loc_234><loc_408>[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</list_item>
+<list_item><loc_41><loc_375><loc_234><loc_408>[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</list_item>
 <list_item><loc_41><loc_410><loc_234><loc_429>[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</list_item>
 <list_item><loc_41><loc_431><loc_234><loc_450>[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</list_item>
-</unordered_list>
-<page_footer><loc_241><loc_463><loc_245><loc_469>9</page_footer>
-<unordered_list><list_item><loc_252><loc_48><loc_445><loc_88>[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</list_item>
+<list_item><loc_252><loc_48><loc_445><loc_88>[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</list_item>
 <list_item><loc_252><loc_90><loc_445><loc_109>[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3</list_item>
 <list_item><loc_252><loc_111><loc_445><loc_164>[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3</list_item>
 <list_item><loc_252><loc_166><loc_445><loc_206>[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</list_item>
 <list_item><loc_252><loc_208><loc_445><loc_234>[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</list_item>
 <list_item><loc_252><loc_236><loc_445><loc_276>[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</list_item>
-<list_item><loc_252><loc_278><loc_445><loc_352>[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</list_item>
+<list_item><loc_252><loc_278><loc_445><loc_352>[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</list_item>
 <list_item><loc_252><loc_354><loc_445><loc_394>[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</list_item>
 <list_item><loc_252><loc_396><loc_445><loc_422>[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</list_item>
 <list_item><loc_252><loc_424><loc_445><loc_450>[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</list_item>
 </unordered_list>
+<page_footer><loc_241><loc_463><loc_245><loc_469>9</page_footer>
 <page_break>
 <text><loc_57><loc_48><loc_234><loc_60>Computer Vision and Pattern Recognition , pages 658-666, 2019. 6</text>
 <unordered_list><list_item><loc_41><loc_62><loc_234><loc_102>[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1</list_item>
@ -171,11 +169,10 @@
 <list_item><loc_41><loc_375><loc_234><loc_401>[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4</list_item>
 <list_item><loc_41><loc_403><loc_234><loc_436>[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3</list_item>
 <list_item><loc_41><loc_438><loc_234><loc_450>[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,</list_item>
-</unordered_list>
-<page_footer><loc_239><loc_463><loc_247><loc_469>10</page_footer>
-<unordered_list><list_item><loc_269><loc_48><loc_445><loc_74>and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</list_item>
+<list_item><loc_269><loc_48><loc_445><loc_74>and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</list_item>
 <list_item><loc_252><loc_76><loc_445><loc_102>[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1</list_item>
 </unordered_list>
+<page_footer><loc_239><loc_463><loc_247><loc_469>10</page_footer>
 <page_break>
 <section_header_level_1><loc_109><loc_70><loc_380><loc_86>TableFormer: Table Structure Understanding with Transformers Supplementary Material</section_header_level_1>
 <section_header_level_1><loc_41><loc_102><loc_144><loc_109>1. Details on the datasets</section_header_level_1>
@ -184,8 +181,7 @@
 <text><loc_41><loc_247><loc_234><loc_396>We have developed a technique that tries to derive a missing bounding box out of its neighbors. As a first step, we use the annotation data to generate the most fine-grained grid that covers the table structure. In case of strict HTML tables, all grid squares are associated with some table cell and in the presence of table spans a cell extends across multiple grid squares. When enough bounding boxes are known for a rectangular table, it is possible to compute the geometrical border lines between the grid rows and columns. Eventually this information is used to generate the missing bounding boxes. Additionally, the existence of unused grid squares indicates that the table rows have unequal number of columns and the overall structure is non-strict. The generation of missing bounding boxes for non-strict HTML tables is ambiguous and therefore quite challenging. Thus, we have decided to simply discard those tables. In case of PubTabNet we have computed missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.</text>
 <text><loc_41><loc_398><loc_234><loc_411>Figure 7 illustrates the distribution of the tables across different dimensions per dataset.</text>
 <section_header_level_1><loc_41><loc_418><loc_125><loc_424>1.2. Synthetic datasets</section_header_level_1>
-<text><loc_41><loc_430><loc_234><loc_451>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</text>
-<text><loc_252><loc_103><loc_445><loc_131>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</text>
+<text><loc_41><loc_430><loc_234><loc_451><loc_41><loc_430><loc_234><loc_451>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</text>
 <text><loc_252><loc_133><loc_445><loc_147>The process of generating a synthetic dataset can be decomposed into the following steps:</text>
 <unordered_list><list_item><loc_252><loc_149><loc_445><loc_200>1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</list_item>
 <list_item><loc_252><loc_202><loc_445><loc_283>2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</list_item>
@ -201,6 +197,7 @@
 <unordered_list><list_item><loc_50><loc_133><loc_234><loc_146>· TableFormer output does not include the table cell content.</list_item>
 <list_item><loc_50><loc_154><loc_234><loc_167>· There are occasional inaccuracies in the predictions of the bounding boxes.</list_item>
 </unordered_list>
+<text><loc_252><loc_133><loc_445><loc_161>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</text>
 <text><loc_41><loc_176><loc_234><loc_250>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</text>
 <text><loc_41><loc_252><loc_234><loc_265>Here is a step-by-step description of the prediction postprocessing:</text>
 <unordered_list><list_item><loc_41><loc_267><loc_234><loc_288>1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</list_item>
@ -212,9 +209,7 @@
 <formula><loc_90><loc_394><loc_234><loc_413></formula>
 <text><loc_41><loc_421><loc_234><loc_435>where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.</text>
 <unordered_list><list_item><loc_41><loc_437><loc_234><loc_450>5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</list_item>
-</unordered_list>
-<text><loc_252><loc_133><loc_445><loc_161>dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.</text>
-<unordered_list><list_item><loc_252><loc_164><loc_445><loc_177>6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</list_item>
+<list_item><loc_252><loc_164><loc_445><loc_177>6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</list_item>
 <list_item><loc_252><loc_179><loc_445><loc_245>7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</list_item>
 <list_item><loc_252><loc_247><loc_445><loc_290>8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.</list_item>
 <list_item><loc_252><loc_293><loc_445><loc_359>9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.</list_item>
@ -233,34 +228,37 @@
 <otsl><loc_69><loc_99><loc_195><loc_135></otsl>
 <otsl><loc_68><loc_148><loc_195><loc_184></otsl>
 <otsl><loc_69><loc_195><loc_195><loc_232></otsl>
-<otsl><loc_68><loc_250><loc_203><loc_308><caption><loc_52><loc_317><loc_223><loc_323>Figure 8: Example of a table with multi-line header.</caption></otsl>
-<page_footer><loc_239><loc_463><loc_247><loc_469>13</page_footer>
-<otsl><loc_254><loc_64><loc_454><loc_86></otsl>
+<otsl><loc_68><loc_250><loc_203><loc_308></otsl>
+<caption><loc_52><loc_317><loc_223><loc_323>Figure 8: Example of a table with multi-line header.</caption>
+<otsl><loc_254><loc_64><loc_454><loc_86><caption><loc_252><loc_194><loc_445><loc_207>Figure 9: Example of a table with big empty distance between cells.</caption></otsl>
 <otsl><loc_253><loc_98><loc_454><loc_117></otsl>
 <otsl><loc_253><loc_124><loc_454><loc_147></otsl>
 <picture><loc_253><loc_160><loc_348><loc_185></picture>
-<otsl><loc_253><loc_160><loc_348><loc_185><caption><loc_252><loc_194><loc_445><loc_207>Figure 9: Example of a table with big empty distance between cells.</caption></otsl>
-<otsl><loc_274><loc_245><loc_400><loc_276></otsl>
+<otsl><loc_253><loc_160><loc_348><loc_185></otsl>
+<otsl><loc_274><loc_245><loc_400><loc_276><caption><loc_255><loc_430><loc_443><loc_435>Figure 10: Example of a complex table with empty cells.</caption></otsl>
 <otsl><loc_274><loc_287><loc_400><loc_317></otsl>
 <otsl><loc_274><loc_328><loc_401><loc_358></otsl>
 <picture><loc_273><loc_374><loc_424><loc_420></picture>
-<otsl><loc_273><loc_374><loc_424><loc_420><caption><loc_255><loc_430><loc_443><loc_435>Figure 10: Example of a complex table with empty cells.</caption></otsl>
+<otsl><loc_273><loc_374><loc_424><loc_420></otsl>
+<page_footer><loc_239><loc_463><loc_247><loc_469>13</page_footer>
 <page_break>
 <otsl><loc_42><loc_173><loc_231><loc_217></otsl>
-<picture><loc_42><loc_66><loc_231><loc_218><caption><loc_41><loc_225><loc_234><loc_238>Figure 11: Simple table with different style and empty cells.</caption></picture>
+<picture><loc_42><loc_66><loc_231><loc_218></picture>
+<caption><loc_41><loc_225><loc_234><loc_238>Figure 11: Simple table with different style and empty cells.</caption>
 <otsl><loc_42><loc_286><loc_254><loc_310></otsl>
 <otsl><loc_42><loc_318><loc_254><loc_342></otsl>
 <otsl><loc_42><loc_350><loc_254><loc_374></otsl>
-<picture><loc_41><loc_386><loc_145><loc_414><caption><loc_45><loc_424><loc_230><loc_430>Figure 12: Simple table predictions and post processing.</caption></picture>
-<page_footer><loc_239><loc_463><loc_247><loc_469>14</page_footer>
+<picture><loc_41><loc_386><loc_145><loc_414></picture>
+<caption><loc_45><loc_424><loc_230><loc_430>Figure 12: Simple table predictions and post processing.</caption>
 <otsl><loc_261><loc_102><loc_437><loc_135></otsl>
 <otsl><loc_261><loc_143><loc_437><loc_177></otsl>
 <otsl><loc_268><loc_182><loc_428><loc_226></otsl>
 <picture><loc_260><loc_57><loc_437><loc_227><caption><loc_258><loc_235><loc_440><loc_240>Figure 13: Table predictions example on colorful table.</caption></picture>
-<otsl><loc_261><loc_272><loc_424><loc_302></otsl>
+<otsl><loc_261><loc_272><loc_424><loc_302><caption><loc_282><loc_432><loc_416><loc_437>Figure 14: Example with multi-line text.</caption></otsl>
 <otsl><loc_261><loc_309><loc_424><loc_338></otsl>
 <otsl><loc_261><loc_345><loc_425><loc_374></otsl>
-<otsl><loc_261><loc_385><loc_436><loc_422><caption><loc_282><loc_432><loc_416><loc_437>Figure 14: Example with multi-line text.</caption></otsl>
+<otsl><loc_261><loc_385><loc_436><loc_422></otsl>
+<page_footer><loc_239><loc_463><loc_247><loc_469>14</page_footer>
 <page_break>
 <picture><loc_45><loc_86><loc_228><loc_157></picture>
 <otsl><loc_45><loc_86><loc_228><loc_157></otsl>
@ -268,14 +266,15 @@
 <otsl><loc_44><loc_164><loc_228><loc_236></otsl>
 <picture><loc_45><loc_243><loc_229><loc_314></picture>
 <picture><loc_41><loc_319><loc_261><loc_399></picture>
-<otsl><loc_41><loc_319><loc_261><loc_399><caption><loc_69><loc_407><loc_206><loc_412>Figure 15: Example with triangular table.</caption></otsl>
-<page_footer><loc_239><loc_463><loc_247><loc_469>15</page_footer>
-<otsl><loc_264><loc_77><loc_430><loc_141></otsl>
+<otsl><loc_264><loc_77><loc_430><loc_141><caption><loc_69><loc_407><loc_206><loc_412>Figure 15: Example with triangular table.</caption></otsl>
 <otsl><loc_264><loc_153><loc_430><loc_217></otsl>
 <picture><loc_264><loc_229><loc_430><loc_293></picture>
 <otsl><loc_264><loc_229><loc_430><loc_293></otsl>
 <picture><loc_289><loc_308><loc_405><loc_401></picture>
-<otsl><loc_289><loc_308><loc_405><loc_401><caption><loc_252><loc_412><loc_445><loc_425>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption></otsl>
+<otsl><loc_289><loc_308><loc_405><loc_401></otsl>
+<otsl><loc_41><loc_319><loc_261><loc_399></otsl>
+<caption><loc_252><loc_412><loc_445><loc_425>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
+<page_footer><loc_239><loc_463><loc_247><loc_469>15</page_footer>
 <page_break>
 <picture><loc_55><loc_160><loc_432><loc_314><caption><loc_41><loc_321><loc_445><loc_334>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption></picture>
 <page_footer><loc_239><loc_463><loc_247><loc_469>16</page_footer>
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md
@ -8,24 +8,23 @@

 ## a. Picture of a table:

+Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
+
 ## 1. Introduction

 The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.

 <!-- image -->

-Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
-
 - b. Red-annotation of bounding boxes, Blue-predictions by TableFormer
-
-<!-- image -->
-
 - c. Structure predicted by TableFormer:

 <!-- image -->

 Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.

+<!-- image -->
+
 | 0   |   1 | 1   |   2 1 |   2 1 |    |
 |-----|-----|-----|-------|-------|----|
 | 3   |   4 | 5 3 |     6 |     7 |    |
@ -157,9 +156,7 @@ TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are re

 <!-- formula-not-decoded -->

-Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved
-
-runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.
+Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved runtime performance and lower memory footprint of TableFormer. This allows to utilize input samples with longer sequences and images with larger dimensions.

 The Transformer Encoder consists of two "Transformer Encoder Layers", with an input feature size of 512, feed forward network of 1024, and 4 attention heads. As for the Transformer Decoder it is composed of four "Transformer Decoder Layers" with similar input and output dimensions as the "Transformer Encoder Layers". Even though our model uses fewer layers and heads than the default implementation parameters, our extensive experimentation has proved this setup to be more suitable for table images. We attribute this finding to the inherent design of table images, which contain mostly lines and text, unlike the more elaborate content present in other scopes (e.g. the COCO dataset). Moreover, we have added ResNet blocks to the inputs of the Structure Decoder and Cell BBox Decoder. This prevents a decoder having a stronger influence over the learned weights which would damage the other prediction task (structure vs bounding boxes), but learn task specific weights instead. Lastly our dropout layers are set to 0.5.

@ -185,8 +182,6 @@ where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDi

 Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across different datasets by a large margin for predicting the table structure from an image. All the more, our model outperforms pre-trained methods. During the evaluation we do not apply any table filtering. We also provide our baseline results on the SynthTabNet dataset. It has been observed that large tables (e.g. tables that occupy half of the page or more) yield poor predictions. We attribute this issue to the image resizing during the preprocessing step, that produces downsampled images with indistinguishable features. This problem can be addressed by treating such big tables with a separate model which accepts a large input image size.

-Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
-
 | Model       | Dataset   | Simple   | TEDS Complex   |   All |
 |-------------|-----------|----------|----------------|-------|
 | EDD         | PTN       | 91.1     | 88.7           | 89.9  |
@ -200,11 +195,11 @@ Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) a
 | TableFormer | TB        | 89.6     | -              | 89.6  |
 | TableFormer | STN       | 96.9     | 95.7           | 96.7  |

+Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
+
 FT: Model was trained on PubTabNet then finetuned.

-Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate
-
-our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.
+Cell Detection. Like any object detector, our Cell BBox Detector provides bounding boxes that can be improved with post-processing during inference. We make use of the grid-like structure of tables to refine the predictions. A detailed explanation on the post-processing is available in the supplementary material. As shown in Tab. 3, we evaluate our Cell BBox Decoder accuracy for cells with a class label of 'content' only using the PASCAL VOC mAP metric for pre-processing and post-processing. Note that we do not have post-processing results for SynthTabNet as images are only provided. To compare the performance of our proposed approach, we've integrated TableFormer's Cell BBox Decoder into EDD architecture. As mentioned previously, the Structure Decoder provides the Cell BBox Decoder with the features needed to predict the bounding box predictions. Therefore, the accuracy of the Structure Decoder directly influences the accuracy of the Cell BBox Decoder . If the Structure Decoder predicts an extra column, this will result in an extra column of predicted bounding boxes.

 Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.

@ -240,6 +235,8 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:

 <!-- image -->

+Text is aligned to match original for ease of viewing
+
 |                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
 |----------------------------------------------------|-------------|----------------|----------------|------------|------------|
 | 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
@ -252,8 +249,6 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
 | WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
 |                                                    | 945         | 294            | 651            | 1122       | 955        |

-Text is aligned to match original for ease of viewing
-
 |                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
 |--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
 |                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
@ -263,22 +258,22 @@ Text is aligned to match original for ease of viewing
 | Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
 | Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |

+Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
+
+<!-- image -->
+
 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.

 <!-- image -->

 <!-- image -->

-Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
-
-<!-- image -->
-
 ## 5.5. Qualitative Analysis

-We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
-
 ## 6. Future Work &amp; Conclusion

+We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type.
+
 In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.

 ## References
@ -288,26 +283,25 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
 - end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5
 - [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3
 - [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
+- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
 - [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
+- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
 - [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
 - [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1
 - [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1
 - [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2
 - [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2
 - [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
+- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
 - [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
 - [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6
-
 - [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4
 - [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3
 - [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3
 - [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1
 - [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2
 - [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
+- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
 - [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
 - [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3
 - [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on
@ -326,7 +320,6 @@ Computer Vision and Pattern Recognition , pages 658-666, 2019. 6
 - [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4
 - [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3
 - [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,
-
 - and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7
 - [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1

@ -344,9 +337,7 @@ Figure 7 illustrates the distribution of the tables across different dimensions

 ## 1.2. Synthetic datasets

-Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-
-
-ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).
+Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).

 The process of generating a synthetic dataset can be decomposed into the following steps:

@ -367,6 +358,8 @@ Figure 7: Distribution of the tables across different dimensions per dataset. Si
 - · TableFormer output does not include the table cell content.
 - · There are occasional inaccuracies in the predictions of the bounding boxes.

+dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
+
 However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.

 Here is a step-by-step description of the prediction postprocessing:
@ -382,9 +375,6 @@ Here is a step-by-step description of the prediction postprocessing:
 where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.

 - 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
-
-dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
-
 - 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
 - 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
 - 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.
@ -403,21 +393,15 @@ phan cell.

 Aditional images with examples of TableFormer predictions and post-processing can be found below.

-Figure 8: Example of a table with multi-line header.
-
-<!-- image -->
-
 Figure 9: Example of a table with big empty distance between cells.

 <!-- image -->

 Figure 10: Example of a complex table with empty cells.

-Figure 11: Simple table with different style and empty cells.
-
 <!-- image -->

-Figure 12: Simple table predictions and post processing.
+<!-- image -->

 <!-- image -->

@ -441,8 +425,6 @@ Figure 15: Example with triangular table.

 <!-- image -->

-Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
-
 Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.

 <!-- image -->
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
@ -10,7 +10,7 @@
 <section_header_level_1><loc_44><loc_348><loc_110><loc_354>CCS CONCEPTS</section_header_level_1>
 <text><loc_44><loc_357><loc_243><loc_377>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
 <text><loc_44><loc_401><loc_241><loc_425>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
-<text><loc_44><loc_426><loc_162><loc_430>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
+<text><loc_44><loc_426><loc_162><loc_430>KDD ’22, August 14-18, 2022, Washington, DC, USA</text>
 <text><loc_44><loc_432><loc_153><loc_436>© 2022 Copyright held by the owner/author(s).</text>
 <text><loc_44><loc_437><loc_128><loc_441>ACM ISBN 978-1-4503-9385-0/22/08.</text>
 <text><loc_44><loc_442><loc_136><loc_446>https://doi.org/10.1145/3534678.3539043</text>
@ -20,7 +20,7 @@
 <section_header_level_1><loc_260><loc_404><loc_331><loc_409>ACM Reference Format:</section_header_level_1>
 <text><loc_260><loc_410><loc_457><loc_447>Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043</text>
 <page_break>
-<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
 <section_header_level_1><loc_44><loc_54><loc_128><loc_61>1 INTRODUCTION</section_header_level_1>
 <text><loc_44><loc_70><loc_248><loc_145>Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.</text>
 <text><loc_44><loc_146><loc_241><loc_317>A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5.</text>
@ -57,12 +57,11 @@
 <section_header_level_1><loc_260><loc_383><loc_384><loc_390>4 ANNOTATION CAMPAIGN</section_header_level_1>
 <text><loc_260><loc_399><loc_457><loc_446>The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four,</text>
 <page_break>
-<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
 <otsl><loc_81><loc_87><loc_419><loc_186><ecel><ecel><ched>% of Total<lcel><lcel><lcel><ched>triple inter-annotator mAP @ 0.5-0.95 (%)<lcel><lcel><lcel><lcel><lcel><nl><ched>class label<ched>Count<ched>Train<ched>Test<ched>Val<ched>All<ched>Fin<ched>Man<ched>Sci<ched>Law<ched>Pat<ched>Ten<nl><rhed>Caption<fcel>22524<fcel>2.04<fcel>1.77<fcel>2.32<fcel>84-89<fcel>40-61<fcel>86-92<fcel>94-99<fcel>95-99<fcel>69-78<fcel>n/a<nl><rhed>Footnote<fcel>6318<fcel>0.60<fcel>0.31<fcel>0.58<fcel>83-91<fcel>n/a<fcel>100<fcel>62-88<fcel>85-94<fcel>n/a<fcel>82-97<nl><rhed>Formula<fcel>25027<fcel>2.25<fcel>1.90<fcel>2.96<fcel>83-85<fcel>n/a<fcel>n/a<fcel>84-87<fcel>86-96<fcel>n/a<fcel>n/a<nl><rhed>List-item<fcel>185660<fcel>17.19<fcel>13.34<fcel>15.82<fcel>87-88<fcel>74-83<fcel>90-92<fcel>97-97<fcel>81-85<fcel>75-88<fcel>93-95<nl><rhed>Page-footer<fcel>70878<fcel>6.51<fcel>5.58<fcel>6.00<fcel>93-94<fcel>88-90<fcel>95-96<fcel>100<fcel>92-97<fcel>100<fcel>96-98<nl><rhed>Page-header<fcel>58022<fcel>5.10<fcel>6.70<fcel>5.06<fcel>85-89<fcel>66-76<fcel>90-94<fcel>98-100<fcel>91-92<fcel>97-99<fcel>81-86<nl><rhed>Picture<fcel>45976<fcel>4.21<fcel>2.78<fcel>5.31<fcel>69-71<fcel>56-59<fcel>82-86<fcel>69-82<fcel>80-95<fcel>66-71<fcel>59-76<nl><rhed>Section-header<fcel>142884<fcel>12.60<fcel>15.77<fcel>12.85<fcel>83-84<fcel>76-81<fcel>90-92<fcel>94-95<fcel>87-94<fcel>69-73<fcel>78-86<nl><rhed>Table<fcel>34733<fcel>3.20<fcel>2.27<fcel>3.60<fcel>77-81<fcel>75-80<fcel>83-86<fcel>98-99<fcel>58-80<fcel>79-84<fcel>70-85<nl><rhed>Text<fcel>510377<fcel>45.82<fcel>49.28<fcel>45.00<fcel>84-86<fcel>81-86<fcel>88-93<fcel>89-93<fcel>87-92<fcel>71-79<fcel>87-95<nl><rhed>Title<fcel>5071<fcel>0.47<fcel>0.30<fcel>0.50<fcel>60-72<fcel>24-63<fcel>50-63<fcel>94-100<fcel>82-96<fcel>68-79<fcel>24-56<nl><rhed>Total<fcel>1107470<fcel>941123<fcel>99816<fcel>66531<fcel>82-83<fcel>71-74<fcel>79-81<fcel>89-94<fcel>86-91<fcel>71-76<fcel>68-85<nl><caption><loc_44><loc_54><loc_456><loc_73>Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.</caption></otsl>
 <picture><loc_43><loc_196><loc_242><loc_341><caption><loc_44><loc_350><loc_242><loc_383>Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.</caption></picture>
 <text><loc_44><loc_400><loc_240><loc_426>we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.</text>
-<text><loc_44><loc_428><loc_241><loc_447>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources</text>
-<text><loc_260><loc_197><loc_457><loc_237>include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</text>
+<text><loc_44><loc_428><loc_241><loc_447><loc_44><loc_428><loc_241><loc_447>Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.</text>
 <text><loc_260><loc_239><loc_457><loc_320>Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.</text>
 <text><loc_259><loc_321><loc_457><loc_438>Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on</text>
 <footnote><loc_260><loc_443><loc_302><loc_448>$^{3}$https://arxiv.org/</footnote>
@ -81,13 +80,15 @@
 </unordered_list>
 <text><loc_44><loc_336><loc_241><loc_363>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</text>
 <text><loc_44><loc_364><loc_241><loc_446>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</text>
-<picture><loc_258><loc_54><loc_457><loc_290><caption><loc_260><loc_299><loc_457><loc_318>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption></picture>
+<picture><loc_258><loc_54><loc_457><loc_290></picture>
 <text><loc_327><loc_289><loc_389><loc_291>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</text>
+<caption><loc_260><loc_299><loc_457><loc_318>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
 <text><loc_259><loc_332><loc_456><loc_344>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
 <text><loc_259><loc_346><loc_457><loc_448>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
 <page_break>
-<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
-<otsl><loc_51><loc_124><loc_233><loc_222><ecel><ched>human<ched>MRCNN<lcel><ched>FRCNN<ched>YOLO<nl><ecel><ucel><ched>R50<ched>R101<ched>R101<ched>v5x6<nl><rhed>Caption<fcel>84-89<fcel>68.4<fcel>71.5<fcel>70.1<fcel>77.7<nl><rhed>Footnote<fcel>83-91<fcel>70.9<fcel>71.8<fcel>73.7<fcel>77.2<nl><rhed>Formula<fcel>83-85<fcel>60.1<fcel>63.4<fcel>63.5<fcel>66.2<nl><rhed>List-item<fcel>87-88<fcel>81.2<fcel>80.8<fcel>81.0<fcel>86.2<nl><rhed>Page-footer<fcel>93-94<fcel>61.6<fcel>59.3<fcel>58.9<fcel>61.1<nl><rhed>Page-header<fcel>85-89<fcel>71.9<fcel>70.0<fcel>72.0<fcel>67.9<nl><rhed>Picture<fcel>69-71<fcel>71.7<fcel>72.7<fcel>72.0<fcel>77.1<nl><rhed>Section-header<fcel>83-84<fcel>67.6<fcel>69.3<fcel>68.4<fcel>74.6<nl><rhed>Table<fcel>77-81<fcel>82.2<fcel>82.9<fcel>82.2<fcel>86.3<nl><rhed>Text<fcel>84-86<fcel>84.6<fcel>85.8<fcel>85.4<fcel>88.1<nl><rhed>Title<fcel>60-72<fcel>76.7<fcel>80.4<fcel>79.9<fcel>82.7<nl><rhed>All<fcel>82-83<fcel>72.4<fcel>73.5<fcel>73.4<fcel>76.8<nl><caption><loc_44><loc_55><loc_242><loc_116>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</caption></otsl>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<text><loc_44><loc_55><loc_242><loc_116>Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.</text>
+<otsl><loc_51><loc_124><loc_233><loc_222><ecel><ched>human<ched>MRCNN<lcel><ched>FRCNN<ched>YOLO<nl><ecel><ucel><ched>R50<ched>R101<ched>R101<ched>v5x6<nl><rhed>Caption<fcel>84-89<fcel>68.4<fcel>71.5<fcel>70.1<fcel>77.7<nl><rhed>Footnote<fcel>83-91<fcel>70.9<fcel>71.8<fcel>73.7<fcel>77.2<nl><rhed>Formula<fcel>83-85<fcel>60.1<fcel>63.4<fcel>63.5<fcel>66.2<nl><rhed>List-item<fcel>87-88<fcel>81.2<fcel>80.8<fcel>81.0<fcel>86.2<nl><rhed>Page-footer<fcel>93-94<fcel>61.6<fcel>59.3<fcel>58.9<fcel>61.1<nl><rhed>Page-header<fcel>85-89<fcel>71.9<fcel>70.0<fcel>72.0<fcel>67.9<nl><rhed>Picture<fcel>69-71<fcel>71.7<fcel>72.7<fcel>72.0<fcel>77.1<nl><rhed>Section-header<fcel>83-84<fcel>67.6<fcel>69.3<fcel>68.4<fcel>74.6<nl><rhed>Table<fcel>77-81<fcel>82.2<fcel>82.9<fcel>82.2<fcel>86.3<nl><rhed>Text<fcel>84-86<fcel>84.6<fcel>85.8<fcel>85.4<fcel>88.1<nl><rhed>Title<fcel>60-72<fcel>76.7<fcel>80.4<fcel>79.9<fcel>82.7<nl><rhed>All<fcel>82-83<fcel>72.4<fcel>73.5<fcel>73.4<fcel>76.8<nl></otsl>
 <text><loc_44><loc_234><loc_241><loc_364>to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.</text>
 <section_header_level_1><loc_44><loc_371><loc_120><loc_378>5 EXPERIMENTS</section_header_level_1>
 <text><loc_44><loc_387><loc_241><loc_448>The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this</text>
@ -100,11 +101,12 @@
 <page_header><loc_44><loc_38><loc_284><loc_43>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</page_header>
 <page_header><loc_299><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA</page_header>
 <text><loc_44><loc_55><loc_242><loc_81>Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.</text>
-<otsl><loc_66><loc_95><loc_218><loc_187><ched>Class-count<ched>11<ched>6<ched>5<ched>4<nl><rhed>Caption<fcel>68<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Footnote<fcel>71<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Formula<fcel>60<fcel>Text<fcel>Text<fcel>Text<nl><rhed>List-item<fcel>81<fcel>Text<fcel>82<fcel>Text<nl><rhed>Page-footer<fcel>62<fcel>62<fcel>-<fcel>-<nl><rhed>Page-header<fcel>72<fcel>68<fcel>-<fcel>-<nl><rhed>Picture<fcel>72<fcel>72<fcel>72<fcel>72<nl><rhed>Section-header<fcel>68<fcel>67<fcel>69<fcel>68<nl><rhed>Table<fcel>82<fcel>83<fcel>82<fcel>82<nl><rhed>Text<fcel>85<fcel>84<fcel>84<fcel>84<nl><rhed>Title<fcel>77<fcel>Sec.-h.<fcel>Sec.-h.<fcel>Sec.-h.<nl><rhed>Overall<fcel>72<fcel>73<fcel>78<fcel>77<nl><caption><loc_260><loc_55><loc_457><loc_81>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</caption></otsl>
+<otsl><loc_66><loc_95><loc_218><loc_187><ched>Class-count<ched>11<ched>6<ched>5<ched>4<nl><rhed>Caption<fcel>68<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Footnote<fcel>71<fcel>Text<fcel>Text<fcel>Text<nl><rhed>Formula<fcel>60<fcel>Text<fcel>Text<fcel>Text<nl><rhed>List-item<fcel>81<fcel>Text<fcel>82<fcel>Text<nl><rhed>Page-footer<fcel>62<fcel>62<fcel>-<fcel>-<nl><rhed>Page-header<fcel>72<fcel>68<fcel>-<fcel>-<nl><rhed>Picture<fcel>72<fcel>72<fcel>72<fcel>72<nl><rhed>Section-header<fcel>68<fcel>67<fcel>69<fcel>68<nl><rhed>Table<fcel>82<fcel>83<fcel>82<fcel>82<nl><rhed>Text<fcel>85<fcel>84<fcel>84<fcel>84<nl><rhed>Title<fcel>77<fcel>Sec.-h.<fcel>Sec.-h.<fcel>Sec.-h.<nl><rhed>Overall<fcel>72<fcel>73<fcel>78<fcel>77<nl></otsl>
 <section_header_level_1><loc_44><loc_202><loc_107><loc_208>Learning Curve</section_header_level_1>
 <text><loc_43><loc_211><loc_241><loc_334>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
 <section_header_level_1><loc_44><loc_342><loc_134><loc_349>Impact of Class Labels</section_header_level_1>
 <text><loc_44><loc_352><loc_241><loc_447>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
+<text><loc_260><loc_55><loc_457><loc_81>Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH<tildelow> 10% point improvement.</text>
 <otsl><loc_288><loc_95><loc_427><loc_193><fcel>Class-count<ched>11<lcel><ched>5<lcel><nl><fcel>Split<ched>Doc<ched>Page<ched>Doc<ched>Page<nl><rhed>Caption<fcel>68<fcel>83<ecel><ecel><nl><rhed>Footnote<fcel>71<fcel>84<ecel><ecel><nl><rhed>Formula<fcel>60<fcel>66<ecel><ecel><nl><rhed>List-item<fcel>81<fcel>88<fcel>82<fcel>88<nl><rhed>Page-footer<fcel>62<fcel>89<ecel><ecel><nl><rhed>Page-header<fcel>72<fcel>90<ecel><ecel><nl><rhed>Picture<fcel>72<fcel>82<fcel>72<fcel>82<nl><rhed>Section-header<fcel>68<fcel>83<fcel>69<fcel>83<nl><rhed>Table<fcel>82<fcel>89<fcel>82<fcel>90<nl><rhed>Text<fcel>85<fcel>91<fcel>84<fcel>90<nl><rhed>Title<fcel>77<fcel>81<ecel><ecel><nl><rhed>All<fcel>72<fcel>84<fcel>78<fcel>87<nl></otsl>
 <text><loc_260><loc_209><loc_457><loc_263>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</text>
 <section_header_level_1><loc_260><loc_271><loc_449><loc_278>Impact of Document Split in Train and Test Set</section_header_level_1>
@ -112,8 +114,9 @@
 <section_header_level_1><loc_260><loc_384><loc_342><loc_391>Dataset Comparison</section_header_level_1>
 <text><loc_260><loc_394><loc_457><loc_447>Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,</text>
 <page_break>
-<page_header><loc_44><loc_38><loc_456><loc_43>KDD ’22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
-<otsl><loc_59><loc_109><loc_225><loc_215><ecel><ecel><ched>Testing on<lcel><lcel><nl><ched>Training on<ched>labels<ched>PLN<ched>DB<ched>DLN<nl><rhed>PubLayNet (PLN)<rhed>Figure<fcel>96<fcel>43<fcel>23<nl><ucel><rhed>Sec-header<fcel>87<fcel>-<fcel>32<nl><ucel><rhed>Table<fcel>95<fcel>24<fcel>49<nl><ucel><rhed>Text<fcel>96<fcel>-<fcel>42<nl><ucel><rhed>total<fcel>93<fcel>34<fcel>30<nl><rhed>DocBank (DB)<rhed>Figure<fcel>77<fcel>71<fcel>31<nl><ucel><rhed>Table<fcel>19<fcel>65<fcel>22<nl><ucel><rhed>total<fcel>48<fcel>68<fcel>27<nl><rhed>DocLayNet (DLN)<rhed>Figure<fcel>67<fcel>51<fcel>72<nl><ucel><rhed>Sec-header<fcel>53<fcel>-<fcel>68<nl><ucel><rhed>Table<fcel>87<fcel>43<fcel>82<nl><ucel><rhed>Text<fcel>77<fcel>-<fcel>84<nl><ucel><rhed>total<fcel>59<fcel>47<fcel>78<nl><caption><loc_44><loc_55><loc_242><loc_95>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</caption></otsl>
+<page_header><loc_44><loc_38><loc_456><loc_43>KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar</page_header>
+<text><loc_44><loc_55><loc_242><loc_95>Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.</text>
+<otsl><loc_59><loc_109><loc_225><loc_215><ecel><ecel><ched>Testing on<lcel><lcel><nl><ched>Training on<ched>labels<ched>PLN<ched>DB<ched>DLN<nl><rhed>PubLayNet (PLN)<rhed>Figure<fcel>96<fcel>43<fcel>23<nl><ucel><rhed>Sec-header<fcel>87<fcel>-<fcel>32<nl><ucel><rhed>Table<fcel>95<fcel>24<fcel>49<nl><ucel><rhed>Text<fcel>96<fcel>-<fcel>42<nl><ucel><rhed>total<fcel>93<fcel>34<fcel>30<nl><rhed>DocBank (DB)<rhed>Figure<fcel>77<fcel>71<fcel>31<nl><ucel><rhed>Table<fcel>19<fcel>65<fcel>22<nl><ucel><rhed>total<fcel>48<fcel>68<fcel>27<nl><rhed>DocLayNet (DLN)<rhed>Figure<fcel>67<fcel>51<fcel>72<nl><ucel><rhed>Sec-header<fcel>53<fcel>-<fcel>68<nl><ucel><rhed>Table<fcel>87<fcel>43<fcel>82<nl><ucel><rhed>Text<fcel>77<fcel>-<fcel>84<nl><ucel><rhed>total<fcel>59<fcel>47<fcel>78<nl></otsl>
 <text><loc_44><loc_247><loc_240><loc_280>Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text .</text>
 <text><loc_44><loc_281><loc_241><loc_370>For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts.</text>
 <section_header_level_1><loc_44><loc_382><loc_127><loc_388>Example Predictions</section_header_level_1>
@ -143,13 +146,13 @@
 <picture><loc_43><loc_53><loc_455><loc_279><caption><loc_51><loc_279><loc_260><loc_283>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption></picture>
 <text><loc_44><loc_293><loc_457><loc_319>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</text>
 <text><loc_57><loc_333><loc_241><loc_347>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</text>
-<unordered_list><list_item><loc_44><loc_348><loc_241><loc_362>[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</list_item>
+<unordered_list><list_item><loc_260><loc_333><loc_457><loc_342>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
+<list_item><loc_44><loc_348><loc_241><loc_362>[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</list_item>
 <list_item><loc_44><loc_363><loc_240><loc_372>[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</list_item>
 <list_item><loc_44><loc_373><loc_241><loc_387>[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</list_item>
 <list_item><loc_44><loc_388><loc_241><loc_397>[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</list_item>
 <list_item><loc_44><loc_398><loc_241><loc_422>[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</list_item>
 <list_item><loc_44><loc_423><loc_241><loc_448>[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</list_item>
-<list_item><loc_260><loc_333><loc_457><loc_342>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</list_item>
 <list_item><loc_260><loc_343><loc_457><loc_357>[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</list_item>
 <list_item><loc_260><loc_358><loc_457><loc_377>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</list_item>
 <list_item><loc_260><loc_378><loc_457><loc_387>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</list_item>
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.md
+++ b/tests/data/groundtruth/docling_v2/2206.01062.md
@ -20,7 +20,7 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA
+KDD ’22, August 14-18, 2022, Washington, DC, USA

 © 2022 Copyright held by the owner/author(s).

@ -119,9 +119,7 @@ Figure 3: Corpus Conversion Service annotation user interface. The PDF page is s

 we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.

-Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources
-
-include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.
+Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv$^{3}$, government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process.

 Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains.

@ -144,8 +142,6 @@ The complete annotation guideline is over 100 pages long and a detailed descript

 Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations

-Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
-
 <!-- image -->

 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0
@ -192,8 +188,6 @@ In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], F

 Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels.

-Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH&lt;tildelow&gt; 10% point improvement.
-
 | Class-count    |   11 | 6       | 5       | 4       |
 |----------------|------|---------|---------|---------|
 | Caption        |   68 | Text    | Text    | Text    |
@ -217,6 +211,8 @@ One of the fundamental questions related to any dataset is if it is "large enoug

 The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of

+Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in GLYPH&lt;tildelow&gt; 10% point improvement.
+
 | Class-count    | 11   | 11   | 5   | 5    |
 |----------------|------|------|-----|------|
 | Split          | Doc  | Page | Doc | Page |
@ -302,13 +298,13 @@ Figure 6: Example layout predictions on selected pages from the DocLayNet test-s

 Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.

+- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
 - [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.
 - [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
 - [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.
 - [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.
 - [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.
 - [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
 - [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
 - [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
 - [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.doctags.txt
@ -1,8 +1,8 @@
 <doctag><section_header_level_1><loc_109><loc_79><loc_258><loc_87>JavaScript Code Example</section_header_level_1>
 <text><loc_109><loc_94><loc_390><loc_183>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
 <text><loc_109><loc_185><loc_390><loc_213>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
-<paragraph><loc_182><loc_221><loc_317><loc_226>Listing 1: Simple JavaScript Program</paragraph>
 <code<loc_110><loc_231><loc_215><loc_257><_unknown_>function add(a, b) { return a + b; } console.log(add(3, 5));</code
+<caption><loc_182><loc_221><loc_317><loc_226>Listing 1: Simple JavaScript Program</caption>
 <text><loc_109><loc_265><loc_390><loc_353>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
 <text><loc_109><loc_355><loc_390><loc_383>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
 <page_footer><loc_248><loc_439><loc_252><loc_445>1</page_footer>
--- a/tests/data/groundtruth/docling_v2/code_and_formula.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.md
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.md
@ -4,8 +4,6 @@ Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod

 Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,

-Listing 1: Simple JavaScript Program
-
 ```
 function add(a, b) { return a + b; } console.log(add(3, 5));
 ```
--- a/tests/data/groundtruth/docling_v2/picture_classification.json
+++ b/tests/data/groundtruth/docling_v2/picture_classification.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.md
@ -6,7 +6,7 @@ Front cover

 <!-- image -->

-<!-- image -->
+Front cover

 ## Contents

@ -108,20 +108,20 @@ This paper was produced by the IBM DB2 for i Center of Excellence team in partne

 <!-- image -->

-<!-- image -->
-
 Jim Bainbridge is a senior DB2 consultant on the DB2 for i Center of Excellence team in the IBM Lab Services and Training organization. His primary role is training and implementation services for IBM DB2 Web Query for i and business analytics. Jim began his career with IBM 30 years ago in the IBM Rochester Development Lab, where he developed cooperative processing products that paired IBM PCs with IBM S/36 and AS/.400 systems. In the years since, Jim has held numerous technical roles, including independent software vendors technical support on a broad range of IBM technologies and products, and supporting customers in the IBM Executive Briefing Center and IBM Project Office.

+<!-- image -->
+
 Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Rochester, Minnesota. He writes extensively and teaches IBM classes worldwide in all areas of DB2 for i. Before joining STG Lab Services, he worked in the ITSO for nine years writing multiple IBM Redbooksfi publications. He also worked for IBM Colombia as an IBM AS/400fi IT Specialist doing presales support for the Andean countries. He has 28 years of experience in the computing field and has taught database classes in Colombian universities. He holds a Master's degree in Computer Science from EAFIT, Colombia. His areas of expertise are database technology, performance, and data warehousing. Hernando can be contacted at hbedoya@us.ibm.com .

 ## Authors

 <!-- image -->

-Chapter 1.
-
 1

+Chapter 1.
+
 ## Securing and protecting IBM DB2 data

 Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.
@ -196,8 +196,6 @@ Table 2-1 FUNCTION\_USAGE view

 To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.

-Example 2-1 Query to determine who has authority to define and manage RCAC
-
 SELECT

 function\_id,
@ -241,10 +239,10 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
 | User action                                                                    | *JOBCTL   | QIBM\_DB\_SECADM   | QIBM\_DB\_SQLADM   | QIBM\_DB\_SYSMON   | No Authority   |
 |--------------------------------------------------------------------------------|-----------|------------------|------------------|------------------|----------------|
 | SET CURRENT DEGREE  (SQL statement)                                            | X         |                  | X                |                  |                |
-| CHGQRYA  command targeting a different user's job                              | X         |                  | X                |                  |                |
-| STRDBMON  or  ENDDBMON  commands targeting a different user's job              | X         |                  | X                |                  |                |
+| CHGQRYA  command targeting a different user’s job                              | X         |                  | X                |                  |                |
+| STRDBMON  or  ENDDBMON  commands targeting a different user’s job              | X         |                  | X                |                  |                |
 | STRDBMON  or  ENDDBMON  commands targeting a job that matches the current user | X         |                  | X                | X                | X              |
-| QUSRJOBI() API format 900 or System i Navigator's SQL Details for Job          | X         |                  | X                | X                |                |
+| QUSRJOBI() API format 900 or System i Navigator’s SQL Details for Job          | X         |                  | X                | X                |                |
 | Visual Explain within Run SQL scripts                                          | X         |                  | X                | X                | X              |
 | Visual Explain outside of Run SQL scripts                                      | X         |                  | X                |                  |                |
 | ANALYZE PLAN CACHE procedure                                                   | X         |                  | X                |                  |                |
@ -253,7 +251,7 @@ Table 2-2 Comparison of the different function usage IDs and *JOBCTL authority
 | MODIFY PLAN CACHE PROPERTIES procedure (currently does not check authority)    | X         |                  | X                |                  |                |
 | CHANGE PLAN CACHE SIZE procedure (currently does not check authority)          | X         |                  | X                |                  |                |

-The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.Figure 3-1 CREATE PERMISSION SQL statement
+Figure 3-1 CREATE PERMISSION SQL statement

 <!-- image -->

@ -261,8 +259,6 @@ The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to defin

 A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.

-Table 3-1 summarizes these special registers and their values.
-
 Table 3-1 Special registers and their corresponding values

 | Special register     | Corresponding value                                                                                                                   |
@ -336,8 +332,6 @@ WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . D
 - -Any other person sees the entire TAX\_ID as masked, for example, XXX-XX-XXXX.
 - To implement this column mask, run the SQL statement that is shown in Example 3-9.

-Example 3-9 Creating a mask on the TAX\_ID column
-
 ```
 CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ;
 ```
@ -374,13 +368,12 @@ Figure 3-11 Selecting the EMPLOYEES table from System i Navigator
 <!-- image -->

 - 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.
+- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.

 Figure 4-68 Visual Explain with RCAC enabled

 <!-- image -->

- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.
-
 Figure 4-69 Index advice with no RCAC

 <!-- image -->
@ -397,10 +390,10 @@ Implement roles and separation of duties

 Leverage row permissions on the database

-Protect columns by defining column masks
-
 This IBM Redpaper publication provides information about the IBM i 7.2 feature of IBM DB2 for i Row and Column Access Control (RCAC). It offers a broad description of the function and advantages of controlling access to data in a comprehensive and transparent way. This publication helps you understand the capabilities of RCAC and provides examples of defining, creating, and implementing the row permissions and column masks in a relational database environment.

+Protect columns by defining column masks
+
 This paper is intended for database engineers, data-centric application developers, and security officers who want to design and implement RCAC as a part of their data control and governance policy. A solid background in IBM i object level security, DB2 for i relational database concepts, and SQL is assumed.

 <!-- image -->
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.doctags.txt
@ -1,8 +1,8 @@
 <doctag><section_header_level_1><loc_183><loc_46><loc_426><loc_55>Pythonو R ةغلب ةجمربلا للاخ نم تلاكشملا لحو ةيجاتنلإا نيسحت</section_header_level_1>
 <text><loc_74><loc_64><loc_427><loc_99>Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ يف دعاستو ةيجاتنلإا ززعت نأ نكمي يتلا ةيوقلا تاودلأا نم ءاملعلاو نيللحملا ىلع لهسي امم ،تانايبلا ليلحتل ةيلاثم اهلعجت ةديرف تازيمPython و R نم لك كلتمي .تلاكشملل ناك اذإ .ةلاعفو ةعيرس ةقيرطب ةدقعم تلايلحت ءارجإ مهسي نأ نكمي تاغللا هذه مادختسا نإف ،ةيليلحت ةيلقع كيدل .لمعلا جئاتن نيسحت يف ريبك لكشب</text>
 <text><loc_170><loc_126><loc_170><loc_134>ً</text>
-<text><loc_82><loc_108><loc_427><loc_143>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</text>
 <text><loc_416><loc_135><loc_416><loc_143>ً</text>
+<text><loc_82><loc_108><loc_427><loc_143>جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .</text>
 <text><loc_76><loc_152><loc_427><loc_186>ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا</text>
 <text><loc_79><loc_195><loc_427><loc_221>Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت</text>
 </doctag>
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.md
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.md
@ -4,10 +4,10 @@ Python و R ةغلب ةجمربلا ربتعت ةلاعف لولح داجيإ ي

 ً

-جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .
-
 ً

+جارختساو تانايبلا نم ةلئاه تايمك ةجلاعم نكمملا نم حبصي ،ةجمربلا تاراهم عم يليلحتلا ريكفتلا عمتجي امدنع ذيفنتلPython و R مادختسا نيجمربملل نكمي .اهنم تاهجوتلاو طامنلأا ةجذمنلا لثم ،ةمدقتم ةيليلحت تايلمع ةقد رثكأ تارارق ذاختا ىلإ ا ضيأ يدؤي نأ نكمي لب ،تقولا رفوي طقف سيل اذه .ةريبكلا تانايبلا ليلحتو ةيئاصحلإا تانايبلا ىلع ةمئاق تاجاتنتسا ىلع ءانب .
+
 ليلحتلا نم ،تاقيبطتلا نم ةعساو ةعومجم معدت ةينغ تاودأو تابتكمPython و R نم لك رفوت ،كلذ ىلع ةولاع ىلع .ةفلتخملا تلاكشملل ةركتبم لولح ريوطتل تابتكملا هذه نم ةدافتسلاا نيمدختسملل نكمي .يللآا ملعتلا ىلإ ينايبلا R رفوت امنيب ،ةءافكب تانايبلا ةرادلإ Python يف pandas ةبتكم مادختسا نكمي ،لاثملا ليبس مسرلل ةيوق تاودأ .نيللحملاو نيثحابلل ةيلاثم اهلعجي امم ،يئاصحلإا ليلحتلاو ينايبلا

 Python و R ةغلب ةجمربلا يدؤت نأ نكمي ،ةياهنلا يف ةركتبم لولح ريفوتو ةيجاتنلإا نيسحت ىلإ ةيليلحت ةيلقع عم اهل نوكت نأ نكمي ةبسانملا ةيجمربلا بيلاسلأا قيبطتو لاعف لكشب تانايبلا ليلحت ىلع ةردقلا نإ .ةدقعملا تلاكشملل .ينهملاو يصخشلا ءادلأا ىلع ىدملا ةديعب ةيباجيإ تاريثأت
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
@ -1 +1 @@
-{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 506.6666564941406, 767.2550048828125], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
+{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@ -1 +1 @@
-{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550048828125, "r": 506.6666564941406, "b": 688.58837890625, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
+{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}