feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)

* Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use docling-core with docling-parse types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes and test updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update unit tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back DoclingParse v1 backend, pipeline options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update locks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update docling-core to 2.22.0

Update dependency library docling-core to latest release 2.22.0
Fix regression tests and ground truth files

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Ground-truth files updated

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests, use TextCell.from_ocr property

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Text fixes, new test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename docling backend to v4

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Test all backends, fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset all tests to use docling-parse v1 for now

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* test_input_doc: use default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2025-03-18 10:38:19 +01:00
committed by GitHub
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions

View File

@@ -6,11 +6,12 @@ from typing import Iterable, List
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
@@ -104,11 +105,13 @@ class BaseOcrModel(BasePageModel):
p.dimension = 2
idx = index.Index(properties=p)
for i, cell in enumerate(programmatic_cells):
idx.insert(i, cell.bbox.as_tuple())
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
def is_overlapping_with_existing_cells(ocr_cell):
# Query the R-tree to get overlapping rectangles
possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
possible_matches_index = list(
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
)
return (
len(possible_matches_index) > 0
@@ -125,10 +128,7 @@ class BaseOcrModel(BasePageModel):
"""
if self.options.force_full_page_ocr:
# If a full page OCR is forced, use only the OCR cells
cells = [
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
for c_ocr in ocr_cells
]
cells = ocr_cells
return cells
## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +156,7 @@ class BaseOcrModel(BasePageModel):
# Draw OCR and programmatic cells
for tc in page.cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
y0 *= scale_x
y1 *= scale_y
x0 *= scale_x
@@ -165,9 +165,8 @@ class BaseOcrModel(BasePageModel):
if y1 <= y0:
y1, y0 = y0, y1
color = "gray"
if isinstance(tc, OcrCell):
color = "magenta"
color = "magenta" if tc.from_ocr else "gray"
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
if show: