Docling/docling/utils/glm_utils.py

import re
from pathlib import Path
from typing import List

import pandas as pd
from docling_core.types.doc import (
    BoundingBox,
    CoordOrigin,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
    ProvenanceItem,
    Size,
    TableCell,
    TableData,
)


def resolve_item(paths, obj):
    """Find item in document from a reference path"""

    if len(paths) == 0:
        return obj

    if paths[0] == "#":
        return resolve_item(paths[1:], obj)

    try:
        key = int(paths[0])
    except:
        key = paths[0]

    if len(paths) == 1:
        if isinstance(key, str) and key in obj:
            return obj[key]
        elif isinstance(key, int) and key < len(obj):
            return obj[key]
        else:
            return None

    elif len(paths) > 1:
        if isinstance(key, str) and key in obj:
            return resolve_item(paths[1:], obj[key])
        elif isinstance(key, int) and key < len(obj):
            return resolve_item(paths[1:], obj[key])
        else:
            return None

    else:
        return None


def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
    unique_objects = []
    seen_spans = set()

    for sublist in grid:
        for obj in sublist:
            # Convert the spans list to a tuple of tuples for hashing
            spans_tuple = tuple(tuple(span) for span in obj["spans"])
            if spans_tuple not in seen_spans:
                seen_spans.add(spans_tuple)
                unique_objects.append(obj)

    return unique_objects


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])

        doc.add_page(page_no=page_no, size=size)

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    current_list = None

    for ix, pelem in enumerate(doc_glm["page-elements"]):
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            # print(json.dumps(pelem, indent=2))
            continue

        iref = pelem["iref"]

        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text = ""
            caption_refs = []
            for caption in obj["captions"]:
                text += caption["text"]

                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            text = ""
            caption_refs = []
            for caption in obj["captions"]:
                text += caption["text"]

                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            table_cells_glm = _flatten_table_grid(obj["data"])

            table_cells = []
            for tbl_cell_glm in table_cells_glm:
                if tbl_cell_glm["bbox"] is not None:
                    bbox = BoundingBox.from_tuple(
                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
                    )
                else:
                    bbox = None

                is_col_header = False
                is_row_header = False
                is_row_section = False

                if tbl_cell_glm["type"] == "col_header":
                    is_col_header = True
                elif tbl_cell_glm["type"] == "row_header":
                    is_row_header = True
                elif tbl_cell_glm["type"] == "row_section":
                    is_row_section = True

                table_cells.append(
                    TableCell(
                        row_span=tbl_cell_glm["row-span"][1]
                        - tbl_cell_glm["row-span"][0],
                        col_span=tbl_cell_glm["col-span"][1]
                        - tbl_cell_glm["col-span"][0],
                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
                        text=tbl_cell_glm["text"],
                        bbox=bbox,
                        column_header=is_col_header,
                        row_header=is_row_header,
                        row_section=is_row_section,
                    )
                )

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov)
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            type_label = pelem["type"]
            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and type_label == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

    return doc


def _add_child_elements(container_el, doc, obj, pelem):
    payload = obj.get("payload")
    if payload is not None:
        children = payload.get("children", [])

        for child in children:
            c_label = DocItemLabel(child["label"])
            c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
                doc.pages[pelem["page"]].size.height
            )
            c_text = " ".join(
                [
                    cell["text"].replace("\x02", "-").strip()
                    for cell in child["cells"]
                    if len(cell["text"].strip()) > 0
                ]
            )

            c_prov = ProvenanceItem(
                page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
            )
            if c_label == DocItemLabel.LIST_ITEM:
                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
            elif c_label == DocItemLabel.SECTION_HEADER:
                doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
            else:
                doc.add_text(
                    parent=container_el, label=c_label, text=c_text, prov=c_prov
                )