feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
* Add DoclingParseV3 backend implementation
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Use docling-core with docling-parse types
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes and test updates
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix streams
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test cases
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* update test units
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back DoclingParse v1 backend, pipeline options
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update locks
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: update docling-core to 2.22.0
  Update dependency library docling-core to latest release 2.22.0
  Fix regression tests and ground truth files
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* Ground-truth files updated
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update tests, use TextCell.from_ocr property
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Text fixes, new test data
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename docling backend to v4
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Test all backends, fixes
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Reset all tests to use docling-parse v1 for now
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fixes for DPv4 backend init, better test coverage
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* test_input_doc use default backend
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
text_piece = self.text_page.get_text_bounded(*rect)
|
||||
x0, y0, x1, y1 = rect
|
||||
cells.append(
|
||||
Cell(
|
||||
id=cell_counter,
|
||||
TextCell(
|
||||
index=cell_counter,
|
||||
text=text_piece,
|
||||
bbox=BoundingBox(
|
||||
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||
orig=text_piece,
|
||||
from_ocr=False,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox(
|
||||
l=x0,
|
||||
b=y0,
|
||||
r=x1,
|
||||
t=y1,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
)
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
||||
# The cell merging code below is to clean this up.
|
||||
def merge_horizontal_cells(
|
||||
cells: List[Cell],
|
||||
cells: List[TextCell],
|
||||
horizontal_threshold_factor: float = 1.0,
|
||||
vertical_threshold_factor: float = 0.5,
|
||||
) -> List[Cell]:
|
||||
) -> List[TextCell]:
|
||||
if not cells:
|
||||
return []
|
||||
|
||||
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
||||
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
||||
rows = []
|
||||
current_row = [cells[0]]
|
||||
row_top = cells[0].bbox.t
|
||||
row_bottom = cells[0].bbox.b
|
||||
row_height = cells[0].bbox.height
|
||||
row_top = cells[0].rect.to_bounding_box().t
|
||||
row_bottom = cells[0].rect.to_bounding_box().b
|
||||
row_height = cells[0].rect.to_bounding_box().height
|
||||
|
||||
for cell in cells[1:]:
|
||||
vertical_threshold = row_height * vertical_threshold_factor
|
||||
if (
|
||||
abs(cell.bbox.t - row_top) <= vertical_threshold
|
||||
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
||||
abs(cell.rect.to_bounding_box().t - row_top)
|
||||
<= vertical_threshold
|
||||
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
||||
<= vertical_threshold
|
||||
):
|
||||
current_row.append(cell)
|
||||
row_top = min(row_top, cell.bbox.t)
|
||||
row_bottom = max(row_bottom, cell.bbox.b)
|
||||
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
||||
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
||||
row_height = row_bottom - row_top
|
||||
else:
|
||||
rows.append(current_row)
|
||||
current_row = [cell]
|
||||
row_top = cell.bbox.t
|
||||
row_bottom = cell.bbox.b
|
||||
row_height = cell.bbox.height
|
||||
row_top = cell.rect.to_bounding_box().t
|
||||
row_bottom = cell.rect.to_bounding_box().b
|
||||
row_height = cell.rect.to_bounding_box().height
|
||||
|
||||
if current_row:
|
||||
rows.append(current_row)
|
||||
|
||||
return rows
|
||||
|
||||
def merge_row(row: List[Cell]) -> List[Cell]:
|
||||
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
||||
merged = []
|
||||
current_group = [row[0]]
|
||||
|
||||
for cell in row[1:]:
|
||||
prev_cell = current_group[-1]
|
||||
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
||||
avg_height = (
|
||||
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
||||
) / 2
|
||||
if (
|
||||
cell.bbox.l - prev_cell.bbox.r
|
||||
cell.rect.to_bounding_box().l
|
||||
- prev_cell.rect.to_bounding_box().r
|
||||
<= avg_height * horizontal_threshold_factor
|
||||
):
|
||||
current_group.append(cell)
|
||||
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return merged
|
||||
|
||||
def merge_group(group: List[Cell]) -> Cell:
|
||||
def merge_group(group: List[TextCell]) -> TextCell:
|
||||
if len(group) == 1:
|
||||
return group[0]
|
||||
|
||||
merged_text = "".join(cell.text for cell in group)
|
||||
merged_bbox = BoundingBox(
|
||||
l=min(cell.bbox.l for cell in group),
|
||||
t=min(cell.bbox.t for cell in group),
|
||||
r=max(cell.bbox.r for cell in group),
|
||||
b=max(cell.bbox.b for cell in group),
|
||||
l=min(cell.rect.to_bounding_box().l for cell in group),
|
||||
t=min(cell.rect.to_bounding_box().t for cell in group),
|
||||
r=max(cell.rect.to_bounding_box().r for cell in group),
|
||||
b=max(cell.rect.to_bounding_box().b for cell in group),
|
||||
)
|
||||
return TextCell(
|
||||
index=group[0].index,
|
||||
text=merged_text,
|
||||
orig=merged_text,
|
||||
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
||||
from_ocr=False,
|
||||
)
|
||||
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
||||
|
||||
rows = group_rows(cells)
|
||||
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
||||
|
||||
for i, cell in enumerate(merged_cells, 1):
|
||||
cell.id = i
|
||||
cell.index = i
|
||||
|
||||
return merged_cells
|
||||
|
||||
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
|
||||
Reference in New Issue
Block a user