Docling/docling/utils/glm_utils.py
Michele Dolfi 6a76b49a47
feat: Expose equation exports (#869)
* pin new docling-core and exploit it via assembler changes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update with docling-core release

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-02-03 10:31:19 +01:00

352 lines
12 KiB
Python

import re
from pathlib import Path
from typing import List
import pandas as pd
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path segments, typically the result of splitting a
            reference such as ``"#/tables/0"`` on ``"/"``. ``"#"`` segments
            are skipped wherever they occur.
        obj: Root container (nested dicts / lists) the path is resolved
            against.

    Returns:
        The referenced item, ``obj`` itself when ``paths`` is empty, or
        ``None`` when any segment cannot be resolved (missing key, index out
        of range, or an intermediate value that is not a dict/list/tuple).
    """
    current = obj
    for segment in paths:
        if segment == "#":
            continue

        # Segments that look like integers are treated as sequence indices.
        try:
            key = int(segment)
        except (TypeError, ValueError):
            key = segment

        if isinstance(current, dict):
            # Prefer the raw segment so that string keys such as "0" can be
            # found; fall back to the converted key.
            if segment in current:
                current = current[segment]
            elif key in current:
                current = current[key]
            else:
                return None
        elif isinstance(current, (list, tuple)):
            # In-range negative indices keep working; anything out of range
            # resolves to None instead of raising IndexError.
            if not isinstance(key, int) or not -len(current) <= key < len(current):
                return None
            current = current[key]
        else:
            # Scalars / None cannot be descended into.
            return None

    return current
def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
unique_objects = []
seen_spans = set()
for sublist in grid:
for obj in sublist:
# Convert the spans list to a tuple of tuples for hashing
spans_tuple = tuple(tuple(span) for span in obj["spans"])
if spans_tuple not in seen_spans:
seen_spans.add(spans_tuple)
unique_objects.append(obj)
return unique_objects
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dictionary into a ``DoclingDocument``.

    Args:
        doc_glm: GLM document as a dict. The keys read here are
            ``"file-info"``, ``"page-dimensions"``, ``"page-elements"`` and,
            optionally, ``"properties"``. Every ``iref`` / ``$ref`` is
            resolved against this dict.
        update_name_label: When true, the label of a ``"paragraph"`` element
            is replaced by the label of its matching ``"semantic"`` property
            row, provided exactly one row matches and its confidence is
            above 0.85.

    Returns:
        The populated ``DoclingDocument``.
    """
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])
        doc.add_page(page_no=page_no, size=size)

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Caption elements are emitted together with their figure/table, so the
    # page elements pointing directly at a caption are skipped below.
    caption_iref = re.compile(r"#/(?:figures|tables)/\d+/captions/.+")

    # Group that consecutive list items are attached to; reset whenever a
    # non-list element is encountered.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]
        if caption_iref.match(iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text, caption_refs = _add_glm_captions(doc, doc_glm, obj)

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            # Validate the label before anything is added to the document.
            item_label = DocItemLabel(pelem["name"])
            _, caption_refs = _add_glm_captions(doc, doc_glm, obj)

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=_glm_table_cells(obj),
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
            tbl.captions.extend(caption_refs)

        elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
            # NOTE(review): unlike every other non-list branch, this one does
            # not reset current_list, so a list may continue across a form /
            # key-value region. Behavior kept as-is; confirm it is intended.
            label = DocItemLabel(ptype)
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = doc.add_group(label=group_label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            # The page element covers only a slice of the referenced text.
            span_i, span_j = pelem["span"][0], pelem["span"][1]
            text = obj["text"][span_i:span_j]

            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and ptype == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None
                doc.add_heading(text=text, prov=prov)
            elif label == DocItemLabel.CODE:
                current_list = None
                doc.add_code(text=text, prov=prov)
            elif label == DocItemLabel.FORMULA:
                current_list = None
                # The source text is passed as `orig`; `text` is left empty.
                doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
            else:
                current_list = None
                doc.add_text(label=label, text=text, prov=prov)

    return doc


def _add_glm_captions(doc, doc_glm, obj):
    """Add the captions of a GLM figure/table object to ``doc``.

    Returns a ``(text, caption_refs)`` tuple: the concatenated caption texts
    and the references of the caption items that were added.
    """
    text = ""
    caption_refs = []
    for caption in obj["captions"]:
        text += caption["text"]

        for nprov in caption["prov"]:
            npaths = nprov["$ref"].split("/")
            nelem = resolve_item(npaths, doc_glm)

            if nelem is None:
                # Dangling caption reference: nothing to add.
                continue

            span_i, span_j = nelem["span"][0], nelem["span"][1]
            cap_text = caption["text"][span_i:span_j]

            prov = ProvenanceItem(
                page_no=nelem["page"],
                charspan=tuple(nelem["span"]),
                bbox=BoundingBox.from_tuple(
                    nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION, text=cap_text, prov=prov
            )
            caption_refs.append(caption_obj.get_ref())

    return text, caption_refs


def _glm_table_cells(obj):
    """Convert the de-duplicated cell grid of a GLM table into ``TableCell``s."""
    table_cells = []
    for tbl_cell_glm in _flatten_table_grid(obj["data"]):
        if tbl_cell_glm["bbox"] is not None:
            bbox = BoundingBox.from_tuple(
                tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            bbox = None

        cell_type = tbl_cell_glm["type"]
        row_start, row_end = tbl_cell_glm["row-span"][0], tbl_cell_glm["row-span"][1]
        col_start, col_end = tbl_cell_glm["col-span"][0], tbl_cell_glm["col-span"][1]

        table_cells.append(
            TableCell(
                row_span=row_end - row_start,
                col_span=col_end - col_start,
                start_row_offset_idx=row_start,
                end_row_offset_idx=row_end,
                start_col_offset_idx=col_start,
                end_col_offset_idx=col_end,
                text=tbl_cell_glm["text"],
                bbox=bbox,
                column_header=cell_type == "col_header",
                row_header=cell_type == "row_header",
                row_section=cell_type == "row_section",
            )
        )
    return table_cells
def _add_child_elements(container_el, doc, obj, pelem):
payload = obj.get("payload")
if payload is not None:
children = payload.get("children", [])
for child in children:
c_label = DocItemLabel(child["label"])
c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
doc.pages[pelem["page"]].size.height
)
c_text = " ".join(
[
cell["text"].replace("\x02", "-").strip()
for cell in child["cells"]
if len(cell["text"].strip()) > 0
]
)
c_prov = ProvenanceItem(
page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
else:
doc.add_text(
parent=container_el, label=c_label, text=c_text, prov=c_prov
)