feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)

* Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use docling-core with docling-parse types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes and test updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test units

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back DoclingParse v1 backend, pipeline options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update locks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update docling-core to 2.22.0

Update dependency library docling-core to latest release 2.22.0
Fix regression tests and ground truth files

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Ground-truth files updated

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests, use TextCell.from_ocr property

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Text fixes, new test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename docling backend to v4

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Test all backends, fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset all tests to use docling-parse v1 for now

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* test_input_doc use default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2025-03-18 10:38:19 +01:00
committed by GitHub
parent 772487f9c9
commit 3960b199d6
126 changed files with 1138 additions and 709 deletions

View File

@@ -6,12 +6,12 @@ from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -68,8 +68,11 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
cells: List[Cell] = []
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
@@ -91,19 +94,24 @@ class DoclingParsePageBackend(PdfPageBackend):
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
Cell(
id=cell_counter,
TextCell(
index=cell_counter,
text=text_piece,
bbox=BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
@@ -112,7 +120,7 @@ class DoclingParsePageBackend(PdfPageBackend):
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),

View File

@@ -6,12 +6,13 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
@@ -78,8 +79,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
cells: List[Cell] = []
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
@@ -106,16 +110,20 @@ class DoclingParseV2PageBackend(PdfPageBackend):
text_piece = cell_data[cells_header.index("text")]
cells.append(
Cell(
id=cell_counter,
TextCell(
index=cell_counter,
text=text_piece,
bbox=BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)

View File

@@ -0,0 +1,185 @@
import logging
import random
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class DoclingParseV4PageBackend(PdfPageBackend):
def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
self._ppage = page_obj
self._dpage = parsed_page
self.valid = parsed_page is not None
def is_valid(self) -> bool:
return self.valid
def get_text_in_rect(self, bbox: BoundingBox) -> str:
# Find intersecting cells on the page
text_piece = ""
page_size = self.get_size()
scale = (
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
)
for i, cell in enumerate(self._dpage.textline_cells):
cell_bbox = (
cell.rect.to_bounding_box()
.to_top_left_origin(page_height=page_size.height)
.scaled(scale)
)
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
if overlap_frac > 0.5:
if len(text_piece) > 0:
text_piece += " "
text_piece += cell.text
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return self._dpage
def get_text_cells(self) -> Iterable[TextCell]:
page_size = self.get_size()
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
# for cell in self._dpage.textline_cells:
# rect = cell.rect
#
# assert (
# rect.to_bounding_box().l <= rect.to_bounding_box().r
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
# assert (
# rect.to_bounding_box().t <= rect.to_bounding_box().b
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
return self._dpage.textline_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
images = self._dpage.bitmap_resources
for img in images:
cropbox = img.rect.to_bounding_box().to_top_left_origin(
self.get_size().height
)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:
cropbox = BoundingBox(
l=0,
r=page_size.width,
t=0,
b=page_size.height,
coord_origin=CoordOrigin.TOPLEFT,
)
padbox = BoundingBox(
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
)
else:
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
) # We resize the image from 1.5x the given scale to make it sharper.
return image
def get_size(self) -> Size:
return Size(
width=self._dpage.dimension.width,
height=self._dpage.dimension.height,
)
def unload(self):
self._ppage = None
self._dpage = None
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = DoclingPdfParser(loglevel="fatal")
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
success = self.dp_doc is not None
if not success:
raise RuntimeError(
f"docling-parse v4 could not load document {self.document_hash}."
)
def page_count(self) -> int:
# return len(self._pdoc) # To be replaced with docling-parse API
len_1 = len(self._pdoc)
len_2 = self.dp_doc.number_of_pages()
if len_1 != len_2:
_log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
return len_2
def load_page(
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
with pypdfium2_lock:
return DoclingParseV4PageBackend(
self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
),
self._pdoc[page_no],
)
def is_valid(self) -> bool:
return self.page_count() > 0
def unload(self):
super().unload()
self.dp_doc.unload()
with pypdfium2_lock:
self._pdoc.close()
self._pdoc = None

View File

@@ -4,10 +4,11 @@ from pathlib import Path
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
pass
@abstractmethod
def get_text_cells(self) -> Iterable[Cell]:
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
pass
@abstractmethod
def get_text_cells(self) -> Iterable[TextCell]:
pass
@abstractmethod

View File

@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
text_piece = self.text_page.get_text_bounded(*rect)
x0, y0, x1, y1 = rect
cells.append(
Cell(
id=cell_counter,
TextCell(
index=cell_counter,
text=text_piece,
bbox=BoundingBox(
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0,
b=y0,
r=x1,
t=y1,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
# The cell merging code below is to clean this up.
def merge_horizontal_cells(
cells: List[Cell],
cells: List[TextCell],
horizontal_threshold_factor: float = 1.0,
vertical_threshold_factor: float = 0.5,
) -> List[Cell]:
) -> List[TextCell]:
if not cells:
return []
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
rows = []
current_row = [cells[0]]
row_top = cells[0].bbox.t
row_bottom = cells[0].bbox.b
row_height = cells[0].bbox.height
row_top = cells[0].rect.to_bounding_box().t
row_bottom = cells[0].rect.to_bounding_box().b
row_height = cells[0].rect.to_bounding_box().height
for cell in cells[1:]:
vertical_threshold = row_height * vertical_threshold_factor
if (
abs(cell.bbox.t - row_top) <= vertical_threshold
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
abs(cell.rect.to_bounding_box().t - row_top)
<= vertical_threshold
and abs(cell.rect.to_bounding_box().b - row_bottom)
<= vertical_threshold
):
current_row.append(cell)
row_top = min(row_top, cell.bbox.t)
row_bottom = max(row_bottom, cell.bbox.b)
row_top = min(row_top, cell.rect.to_bounding_box().t)
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
row_height = row_bottom - row_top
else:
rows.append(current_row)
current_row = [cell]
row_top = cell.bbox.t
row_bottom = cell.bbox.b
row_height = cell.bbox.height
row_top = cell.rect.to_bounding_box().t
row_bottom = cell.rect.to_bounding_box().b
row_height = cell.rect.to_bounding_box().height
if current_row:
rows.append(current_row)
return rows
def merge_row(row: List[Cell]) -> List[Cell]:
def merge_row(row: List[TextCell]) -> List[TextCell]:
merged = []
current_group = [row[0]]
for cell in row[1:]:
prev_cell = current_group[-1]
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
avg_height = (
prev_cell.rect.height + cell.rect.to_bounding_box().height
) / 2
if (
cell.bbox.l - prev_cell.bbox.r
cell.rect.to_bounding_box().l
- prev_cell.rect.to_bounding_box().r
<= avg_height * horizontal_threshold_factor
):
current_group.append(cell)
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged
def merge_group(group: List[Cell]) -> Cell:
def merge_group(group: List[TextCell]) -> TextCell:
if len(group) == 1:
return group[0]
merged_text = "".join(cell.text for cell in group)
merged_bbox = BoundingBox(
l=min(cell.bbox.l for cell in group),
t=min(cell.bbox.t for cell in group),
r=max(cell.bbox.r for cell in group),
b=max(cell.bbox.b for cell in group),
l=min(cell.rect.to_bounding_box().l for cell in group),
t=min(cell.rect.to_bounding_box().t for cell in group),
r=max(cell.rect.to_bounding_box().r for cell in group),
b=max(cell.rect.to_bounding_box().b for cell in group),
)
return TextCell(
index=group[0].index,
text=merged_text,
orig=merged_text,
rect=BoundingRectangle.from_bounding_box(merged_bbox),
from_ocr=False,
)
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
rows = group_rows(cells)
merged_cells = [cell for row in rows for cell in merge_row(row)]
for i, cell in enumerate(merged_cells, 1):
cell.id = i
cell.index = i
return merged_cells
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),