
* add the pytests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * renamed the test folder and added the toplevel test Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updated the toplevel function test Signed-off-by: Peter Staar <taa@zurich.ibm.com> * need to start running all tests successfully Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the reference converted documents Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added first test for json and md output Signed-off-by: Peter Staar <taa@zurich.ibm.com> * ran pre-commit Signed-off-by: Peter Staar <taa@zurich.ibm.com> * replaced deprecated json function with model_dump_json Signed-off-by: Peter Staar <taa@zurich.ibm.com> * replaced deprecated json function with model_dump_json Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fix backend tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * commented out the drawing Signed-off-by: Peter Staar <taa@zurich.ibm.com> * ci: avoid duplicate runs Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> * commented out json verification for now Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added verification of input cells Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformat code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added test to verify the cells in the pages Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added test to verify the cells in the pages (2) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added test to verify the cells in the pages (3) Signed-off-by: Peter Staar <taa@zurich.ibm.com> * run all examples in CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * make sure examples return failures Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * raise a failure if examples fail Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * run examples after tests 
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Add tests and update top_level_tests using only datamodels Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove unnecessary code Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Validate conversion status on e2e test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * package verify utils and add more tests Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * reduce docs in example, since they are already in the tests Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * skip batch_convert Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * pin docling-parse 1.1.2 Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * updated the error messages Signed-off-by: Peter Staar <taa@zurich.ibm.com> * commented out the json verification for now Signed-off-by: Peter Staar <taa@zurich.ibm.com> * bumped GLM version Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fix lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pin new docling-parse v1.1.3 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
87 lines
3.3 KiB
Python
87 lines
3.3 KiB
Python
import copy
|
|
import random
|
|
|
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
|
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
|
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
|
from docling_core.types import BaseText
|
|
from docling_core.types import Document as DsDocument
|
|
from docling_core.types import Ref
|
|
from PIL import ImageDraw
|
|
|
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
|
from docling.datamodel.document import ConversionResult
|
|
|
|
|
|
class GlmModel:
    """Run the deepsearch-glm NLP model over a converted document.

    An instance is callable: it takes a ``ConversionResult``, applies the
    GLM model to its legacy document representation and returns the result
    as a validated ``DsDocument``.
    """

    def __init__(self, config):
        """Initialise the GLM model from ``config``.

        Args:
            config: Mapping-like object (only ``.get`` is used). The optional
                ``"model_names"`` entry is forwarded to ``init_nlp_model``
                and defaults to an empty string when absent.
        """
        self.config = config
        self.model_names = self.config.get(
            "model_names", ""
        )  # e.g. "language;term;reference"

        # The pretrained models are loaded before the model is initialised.
        load_pretrained_nlp_models()
        self.model = init_nlp_model(model_names=self.model_names)

    def __call__(self, conv_res: ConversionResult) -> DsDocument:
        """Apply the GLM model to ``conv_res`` and return the enriched document.

        Args:
            conv_res: Conversion result whose ``_to_ds_document()`` output is
                fed to the model.

        Returns:
            The ``DsDocument`` validated from the dictionary produced by
            ``to_legacy_document_format``.
        """
        ds_doc = conv_res._to_ds_document()
        ds_doc_dict = ds_doc.model_dump(by_alias=True)

        glm_doc = self.model.apply_on_doc(ds_doc_dict)
        ds_doc_dict = to_legacy_document_format(
            glm_doc, ds_doc_dict, update_name_label=True
        )

        exported_doc = DsDocument.model_validate(ds_doc_dict)

        # DEBUG code: visual inspection helper, only used when one of the
        # commented-out calls below is enabled.
        def draw_clusters_and_cells(ds_document, page_no):
            """Show page ``page_no`` with the main-text element boxes drawn on it."""

            def first_prov(item):
                # Guard against a missing or empty provenance list so a single
                # odd element does not abort the whole debug rendering.
                return item.prov[0] if item.prov else None

            page = conv_res.pages[page_no]
            # Draw on a copy so the page image held by conv_res stays untouched.
            image = copy.deepcopy(page.image)

            clusters_to_draw = []
            for ix, elem in enumerate(ds_document.main_text):
                # Reset on every iteration: without this, an element that is
                # neither BaseText nor Ref would hit an unbound ``prov`` (first
                # iteration) or reuse the previous element's ``prov``.
                prov = None
                if isinstance(elem, BaseText):
                    prov = first_prov(elem)
                elif isinstance(elem, Ref):
                    # The reference string is split into three "/"-separated
                    # parts; the middle one selects the collection and the
                    # last one is the index into it.
                    _, arr, index = elem.ref.split("/")
                    index = int(index)
                    if arr == "tables":
                        prov = first_prov(ds_document.tables[index])
                    elif arr == "figures":
                        prov = first_prov(ds_document.figures[index])

                if prov and prov.page == page_no:
                    clusters_to_draw.append(
                        Cluster(
                            id=ix,
                            label=elem.name,
                            # Provenance boxes are interpreted as bottom-left
                            # based and converted to top-left for drawing.
                            bbox=BoundingBox.from_tuple(
                                coord=prov.bbox,
                                origin=CoordOrigin.BOTTOMLEFT,
                            ).to_top_left_origin(page.size.height),
                        )
                    )

            draw = ImageDraw.Draw(image)
            for c in clusters_to_draw:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
                draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))

                # One random colour per cluster, shared by all of its cells.
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                for tc in c.cells:  # [:1]:
                    x0, y0, x1, y1 = tc.bbox.as_tuple()
                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells(ds_doc, 0)
        # draw_clusters_and_cells(exported_doc, 0)

        return exported_doc
|