feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-10-16 21:02:03 +02:00
committed by GitHub
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions

View File

@@ -2,8 +2,9 @@ import logging
from typing import Iterable
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.models.base_ocr_model import BaseOcrModel
@@ -39,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
return
for page in page_batch:
assert page._backend is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []