feat!: Docling v2 (#117)
--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
@@ -1,39 +1,228 @@
|
||||
import copy
|
||||
import random
|
||||
from typing import List, Union
|
||||
|
||||
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
|
||||
from deepsearch_glm.utils.doc_utils import to_docling_document
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Ref
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
|
||||
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
||||
from docling_core.types.legacy_doc.base import Figure, TableCell
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
||||
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
||||
from docling.utils.utils import create_hash
|
||||
|
||||
|
||||
class GlmOptions(BaseModel):
|
||||
model_config = ConfigDict(protected_namespaces=())
|
||||
|
||||
model_names: str = "" # e.g. "language;term;reference"
|
||||
|
||||
|
||||
class GlmModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.model_names = self.config.get(
|
||||
"model_names", ""
|
||||
) # "language;term;reference"
|
||||
load_pretrained_nlp_models()
|
||||
# model = init_nlp_model(model_names="language;term;reference")
|
||||
model = init_nlp_model(model_names=self.model_names)
|
||||
self.model = model
|
||||
def __init__(self, options: GlmOptions):
|
||||
self.options = options
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DsDocument:
|
||||
ds_doc = conv_res._to_ds_document()
|
||||
load_pretrained_nlp_models()
|
||||
self.model = init_nlp_model(model_names=self.options.model_names)
|
||||
|
||||
def _to_legacy_document(self, conv_res) -> DsDocument:
|
||||
title = ""
|
||||
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
|
||||
|
||||
page_hashes = [
|
||||
PageReference(
|
||||
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
|
||||
page=p.page_no + 1,
|
||||
model="default",
|
||||
)
|
||||
for p in conv_res.pages
|
||||
]
|
||||
|
||||
file_info = DsFileInfoObject(
|
||||
filename=conv_res.input.file.name,
|
||||
document_hash=conv_res.input.document_hash,
|
||||
num_pages=conv_res.input.page_count,
|
||||
page_hashes=page_hashes,
|
||||
)
|
||||
|
||||
main_text: List[Union[Ref, BaseText]] = []
|
||||
tables: List[DsSchemaTable] = []
|
||||
figures: List[Figure] = []
|
||||
|
||||
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
# Convert bboxes to lower-left origin.
|
||||
target_bbox = DsBoundingBox(
|
||||
element.cluster.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
)
|
||||
|
||||
if isinstance(element, TextElement):
|
||||
main_text.append(
|
||||
BaseText(
|
||||
text=element.text,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
name=element.label,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, len(element.text)],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
elif isinstance(element, Table):
|
||||
index = len(tables)
|
||||
ref_str = f"#/tables/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
|
||||
# Initialise empty table data grid (only empty cells)
|
||||
table_data = [
|
||||
[
|
||||
TableCell(
|
||||
text="",
|
||||
# bbox=[0,0,0,0],
|
||||
spans=[[i, j]],
|
||||
obj_type="body",
|
||||
)
|
||||
for j in range(element.num_cols)
|
||||
]
|
||||
for i in range(element.num_rows)
|
||||
]
|
||||
|
||||
# Overwrite cells in table data for which there is actual cell content.
|
||||
for cell in element.table_cells:
|
||||
for i in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for j in range(
|
||||
min(cell.start_col_offset_idx, element.num_cols),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
celltype = "body"
|
||||
if cell.column_header:
|
||||
celltype = "col_header"
|
||||
elif cell.row_header:
|
||||
celltype = "row_header"
|
||||
elif cell.row_section:
|
||||
celltype = "row_section"
|
||||
|
||||
def make_spans(cell):
|
||||
for rspan in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for cspan in range(
|
||||
min(
|
||||
cell.start_col_offset_idx, element.num_cols
|
||||
),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
yield [rspan, cspan]
|
||||
|
||||
spans = list(make_spans(cell))
|
||||
if cell.bbox is not None:
|
||||
bbox = cell.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
else:
|
||||
bbox = None
|
||||
|
||||
table_data[i][j] = TableCell(
|
||||
text=cell.text,
|
||||
bbox=bbox,
|
||||
# col=j,
|
||||
# row=i,
|
||||
spans=spans,
|
||||
obj_type=celltype,
|
||||
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
||||
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
||||
)
|
||||
|
||||
tables.append(
|
||||
DsSchemaTable(
|
||||
num_cols=element.num_cols,
|
||||
num_rows=element.num_rows,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
data=table_data,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(element, FigureElement):
|
||||
index = len(figures)
|
||||
ref_str = f"#/figures/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
figures.append(
|
||||
Figure(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no + 1,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
# data=[[]],
|
||||
)
|
||||
)
|
||||
|
||||
page_dimensions = [
|
||||
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
||||
for p in conv_res.pages
|
||||
]
|
||||
|
||||
ds_doc: DsDocument = DsDocument(
|
||||
name=title,
|
||||
description=desc,
|
||||
file_info=file_info,
|
||||
main_text=main_text,
|
||||
tables=tables,
|
||||
figures=figures,
|
||||
page_dimensions=page_dimensions,
|
||||
)
|
||||
|
||||
return ds_doc
|
||||
|
||||
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
||||
ds_doc = self._to_legacy_document(conv_res)
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||
ds_doc_dict = to_legacy_document_format(
|
||||
glm_doc, ds_doc_dict, update_name_label=True
|
||||
)
|
||||
|
||||
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells(ds_document, page_no):
|
||||
@@ -48,7 +237,7 @@ class GlmModel:
|
||||
if arr == "tables":
|
||||
prov = ds_document.tables[index].prov[0]
|
||||
elif arr == "figures":
|
||||
prov = ds_document.figures[index].prov[0]
|
||||
prov = ds_document.pictures[index].prov[0]
|
||||
else:
|
||||
prov = None
|
||||
|
||||
@@ -83,4 +272,4 @@ class GlmModel:
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return exported_doc
|
||||
return docling_doc
|
||||
|
||||
Reference in New Issue
Block a user