
* Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Upgrade to ds-glm 1.0 and docling-parse 3.0 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lock Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix DP2 backend code, change CLI default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
337 lines
11 KiB
Python
337 lines
11 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
import pandas as pd
|
|
from docling_core.types.doc import (
|
|
BoundingBox,
|
|
CoordOrigin,
|
|
DocItemLabel,
|
|
DoclingDocument,
|
|
DocumentOrigin,
|
|
GroupLabel,
|
|
ProvenanceItem,
|
|
Size,
|
|
TableCell,
|
|
TableData,
|
|
)
|
|
|
|
|
|
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Walks ``obj`` one path segment at a time. ``"#"`` segments are skipped,
    segments that parse as integers are used as non-negative indices, and
    every other string segment is used as a key.

    Args:
        paths: Sequence of path segments, e.g. ``"#/texts/3".split("/")``.
        obj: Nested container to descend into.

    Returns:
        The referenced item; ``obj`` itself when ``paths`` is empty or holds
        only ``"#"`` segments; ``None`` when any segment cannot be resolved.
    """
    current = obj
    for segment in paths:
        if segment == "#":
            continue

        # Numeric segments address positions, everything else is a key.
        try:
            key = int(segment)
        except (TypeError, ValueError):
            key = segment

        try:
            if isinstance(key, int):
                # Reject negative indices: a reference must never wrap around
                # to the end of a sequence.
                found = 0 <= key < len(current)
            elif isinstance(key, str):
                found = key in current
            else:
                found = False

            if not found:
                return None
            current = current[key]
        except (KeyError, IndexError, TypeError):
            # The container does not support this kind of lookup (e.g. an
            # index applied to a dict, a key applied to a list, or a None
            # intermediate value): treat the reference as unresolved.
            return None

    return current
|
|
|
|
|
|
def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|
unique_objects = []
|
|
seen_spans = set()
|
|
|
|
for sublist in grid:
|
|
for obj in sublist:
|
|
# Convert the spans list to a tuple of tuples for hashing
|
|
spans_tuple = tuple(tuple(span) for span in obj["spans"])
|
|
if spans_tuple not in seen_spans:
|
|
seen_spans.add(spans_tuple)
|
|
unique_objects.append(obj)
|
|
|
|
return unique_objects
|
|
|
|
|
|
def _page_element_prov(pelem, charspan):
    """Build the provenance of a page element from its page and bbox."""
    return ProvenanceItem(
        page_no=pelem["page"],
        charspan=charspan,
        bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT),
    )


def _add_caption_items(doc, doc_glm, captions):
    """Add one CAPTION text item per resolvable caption provenance.

    Returns a ``(text, refs)`` pair: the concatenation of every caption's
    ``"text"`` and the refs of the caption items added to ``doc``.
    """
    text_parts = []
    caption_refs = []

    for caption in captions:
        text_parts.append(caption["text"])

        for nprov in caption["prov"]:
            nelem = resolve_item(nprov["$ref"].split("/"), doc_glm)
            if nelem is None:
                # Dangling caption reference: nothing to attach.
                continue

            span_i = nelem["span"][0]
            span_j = nelem["span"][1]

            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION,
                text=caption["text"][span_i:span_j],
                prov=_page_element_prov(nelem, tuple(nelem["span"])),
            )
            caption_refs.append(caption_obj.get_ref())

    return "".join(text_parts), caption_refs


def _to_table_cell(tbl_cell_glm):
    """Convert one GLM table cell dict into a ``TableCell``."""
    bbox = None
    if tbl_cell_glm["bbox"] is not None:
        bbox = BoundingBox.from_tuple(
            tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
        )

    cell_type = tbl_cell_glm["type"]
    row_start = tbl_cell_glm["row-span"][0]
    row_end = tbl_cell_glm["row-span"][1]
    col_start = tbl_cell_glm["col-span"][0]
    col_end = tbl_cell_glm["col-span"][1]

    return TableCell(
        row_span=row_end - row_start,
        col_span=col_end - col_start,
        start_row_offset_idx=row_start,
        end_row_offset_idx=row_end,
        start_col_offset_idx=col_start,
        end_col_offset_idx=col_end,
        text=tbl_cell_glm["text"],
        bbox=bbox,
        column_header=cell_type == "col_header",
        row_header=cell_type == "row_header",
        row_section=cell_type == "row_section",
    )


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
    """Convert a GLM document dict into a ``DoclingDocument``.

    Args:
        doc_glm: GLM document. The keys read here are ``"file-info"``
            (``"filename"``, ``"document-hash"``), ``"page-dimensions"``,
            ``"page-elements"`` and, optionally, ``"properties"``
            (``"data"``, ``"headers"``).
        update_name_label: When true and properties are present, the name
            label of a ``"paragraph"`` element is replaced by the label of
            its ``"semantic"`` property, provided exactly one such property
            matches the element's ``iref`` and its confidence exceeds 0.85.

    Returns:
        The populated document. Its name is the stem of the origin filename
        and its origin mimetype is always ``"application/pdf"``.
    """
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])
        doc.add_page(page_no=page_no, size=size)

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Open list group that consecutive list items are appended to.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]

        # Caption elements are emitted together with their figure/table
        # below, so they are skipped as stand-alone page elements.
        if re.match(r"#/(?:figures|tables)/\d+/captions/.+", iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text, caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            pic = doc.add_picture(prov=_page_element_prov(pelem, (0, len(text))))
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            # The concatenated caption text is not used for tables.
            _, caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            table_cells = [
                _to_table_cell(tbl_cell_glm)
                for tbl_cell_glm in _flatten_table_grid(obj["data"])
            ]

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            tbl = doc.add_table(data=tbl_data, prov=_page_element_prov(pelem, (0, 0)))
            tbl.captions.extend(caption_refs)

        elif ptype in ["form", "key_value_region"]:
            label = DocItemLabel(ptype)
            container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)

            _add_child_elements(container_el, doc, obj, pelem)

        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and ptype == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = _page_element_prov(pelem, (0, len(text)))
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            else:
                current_list = None

                doc.add_text(label=label, text=text, prov=prov)

    return doc
|
|
|
|
|
|
def _add_child_elements(container_el, doc, obj, pelem):
|
|
payload = obj.get("payload")
|
|
if payload is not None:
|
|
children = payload.get("children", [])
|
|
|
|
for child in children:
|
|
c_label = DocItemLabel(child["label"])
|
|
c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
|
|
doc.pages[pelem["page"]].size.height
|
|
)
|
|
c_text = " ".join(
|
|
[
|
|
cell["text"].replace("\x02", "-").strip()
|
|
for cell in child["cells"]
|
|
if len(cell["text"].strip()) > 0
|
|
]
|
|
)
|
|
|
|
c_prov = ProvenanceItem(
|
|
page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
|
|
)
|
|
if c_label == DocItemLabel.LIST_ITEM:
|
|
# TODO: Infer if this is a numbered or a bullet list item
|
|
doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
|
|
elif c_label == DocItemLabel.SECTION_HEADER:
|
|
doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
|
|
else:
|
|
doc.add_text(
|
|
parent=container_el, label=c_label, text=c_text, prov=c_prov
|
|
)
|