
* Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
299 lines
10 KiB
Python
299 lines
10 KiB
Python
import logging
|
|
import random
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|
|
|
import pypdfium2 as pdfium
|
|
import pypdfium2.raw as pdfium_c
|
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
|
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
|
from PIL import Image, ImageDraw
|
|
from pypdfium2 import PdfTextPage
|
|
from pypdfium2._helpers.misc import PdfiumError
|
|
|
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
from docling.utils.locks import pypdfium2_lock
|
|
|
|
if TYPE_CHECKING:
|
|
from docling.datamodel.document import InputDocument
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
|
|
class PyPdfiumPageBackend(PdfPageBackend):
|
|
def __init__(
|
|
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
|
):
|
|
# Note: lock applied by the caller
|
|
self.valid = True # No better way to tell from pypdfium.
|
|
try:
|
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
|
except PdfiumError as e:
|
|
_log.info(
|
|
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
|
exc_info=True,
|
|
)
|
|
self.valid = False
|
|
self.text_page: Optional[PdfTextPage] = None
|
|
|
|
def is_valid(self) -> bool:
|
|
return self.valid
|
|
|
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
|
AREA_THRESHOLD = 0 # 32 * 32
|
|
page_size = self.get_size()
|
|
with pypdfium2_lock:
|
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
|
pos = obj.get_pos()
|
|
cropbox = BoundingBox.from_tuple(
|
|
pos, origin=CoordOrigin.BOTTOMLEFT
|
|
).to_top_left_origin(page_height=page_size.height)
|
|
|
|
if cropbox.area() > AREA_THRESHOLD:
|
|
cropbox = cropbox.scaled(scale=scale)
|
|
|
|
yield cropbox
|
|
|
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
|
with pypdfium2_lock:
|
|
if not self.text_page:
|
|
self.text_page = self._ppage.get_textpage()
|
|
|
|
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
|
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
|
|
|
with pypdfium2_lock:
|
|
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
|
|
|
return text_piece
|
|
|
|
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
|
return None
|
|
|
|
def get_text_cells(self) -> Iterable[TextCell]:
|
|
with pypdfium2_lock:
|
|
if not self.text_page:
|
|
self.text_page = self._ppage.get_textpage()
|
|
|
|
cells = []
|
|
cell_counter = 0
|
|
|
|
page_size = self.get_size()
|
|
|
|
with pypdfium2_lock:
|
|
for i in range(self.text_page.count_rects()):
|
|
rect = self.text_page.get_rect(i)
|
|
text_piece = self.text_page.get_text_bounded(*rect)
|
|
x0, y0, x1, y1 = rect
|
|
cells.append(
|
|
TextCell(
|
|
index=cell_counter,
|
|
text=text_piece,
|
|
orig=text_piece,
|
|
from_ocr=False,
|
|
rect=BoundingRectangle.from_bounding_box(
|
|
BoundingBox(
|
|
l=x0,
|
|
b=y0,
|
|
r=x1,
|
|
t=y1,
|
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
|
)
|
|
).to_top_left_origin(page_size.height),
|
|
)
|
|
)
|
|
cell_counter += 1
|
|
|
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
|
# The cell merging code below is to clean this up.
|
|
def merge_horizontal_cells(
|
|
cells: List[TextCell],
|
|
horizontal_threshold_factor: float = 1.0,
|
|
vertical_threshold_factor: float = 0.5,
|
|
) -> List[TextCell]:
|
|
if not cells:
|
|
return []
|
|
|
|
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
|
rows = []
|
|
current_row = [cells[0]]
|
|
row_top = cells[0].rect.to_bounding_box().t
|
|
row_bottom = cells[0].rect.to_bounding_box().b
|
|
row_height = cells[0].rect.to_bounding_box().height
|
|
|
|
for cell in cells[1:]:
|
|
vertical_threshold = row_height * vertical_threshold_factor
|
|
if (
|
|
abs(cell.rect.to_bounding_box().t - row_top)
|
|
<= vertical_threshold
|
|
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
|
<= vertical_threshold
|
|
):
|
|
current_row.append(cell)
|
|
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
|
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
|
row_height = row_bottom - row_top
|
|
else:
|
|
rows.append(current_row)
|
|
current_row = [cell]
|
|
row_top = cell.rect.to_bounding_box().t
|
|
row_bottom = cell.rect.to_bounding_box().b
|
|
row_height = cell.rect.to_bounding_box().height
|
|
|
|
if current_row:
|
|
rows.append(current_row)
|
|
|
|
return rows
|
|
|
|
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
|
merged = []
|
|
current_group = [row[0]]
|
|
|
|
for cell in row[1:]:
|
|
prev_cell = current_group[-1]
|
|
avg_height = (
|
|
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
|
) / 2
|
|
if (
|
|
cell.rect.to_bounding_box().l
|
|
- prev_cell.rect.to_bounding_box().r
|
|
<= avg_height * horizontal_threshold_factor
|
|
):
|
|
current_group.append(cell)
|
|
else:
|
|
merged.append(merge_group(current_group))
|
|
current_group = [cell]
|
|
|
|
if current_group:
|
|
merged.append(merge_group(current_group))
|
|
|
|
return merged
|
|
|
|
def merge_group(group: List[TextCell]) -> TextCell:
|
|
if len(group) == 1:
|
|
return group[0]
|
|
|
|
merged_text = "".join(cell.text for cell in group)
|
|
merged_bbox = BoundingBox(
|
|
l=min(cell.rect.to_bounding_box().l for cell in group),
|
|
t=min(cell.rect.to_bounding_box().t for cell in group),
|
|
r=max(cell.rect.to_bounding_box().r for cell in group),
|
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
|
)
|
|
return TextCell(
|
|
index=group[0].index,
|
|
text=merged_text,
|
|
orig=merged_text,
|
|
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
|
from_ocr=False,
|
|
)
|
|
|
|
rows = group_rows(cells)
|
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
|
|
|
for i, cell in enumerate(merged_cells, 1):
|
|
cell.index = i
|
|
|
|
return merged_cells
|
|
|
|
def draw_clusters_and_cells():
|
|
image = (
|
|
self.get_page_image()
|
|
) # make new image to avoid drawing on the saved ones
|
|
draw = ImageDraw.Draw(image)
|
|
for c in cells:
|
|
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
|
cell_color = (
|
|
random.randint(30, 140),
|
|
random.randint(30, 140),
|
|
random.randint(30, 140),
|
|
)
|
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
|
image.show()
|
|
|
|
# before merge:
|
|
# draw_clusters_and_cells()
|
|
|
|
cells = merge_horizontal_cells(cells)
|
|
|
|
# after merge:
|
|
# draw_clusters_and_cells()
|
|
|
|
return cells
|
|
|
|
def get_page_image(
|
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
|
) -> Image.Image:
|
|
|
|
page_size = self.get_size()
|
|
|
|
if not cropbox:
|
|
cropbox = BoundingBox(
|
|
l=0,
|
|
r=page_size.width,
|
|
t=0,
|
|
b=page_size.height,
|
|
coord_origin=CoordOrigin.TOPLEFT,
|
|
)
|
|
padbox = BoundingBox(
|
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
|
)
|
|
else:
|
|
padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
|
|
padbox.r = page_size.width - padbox.r
|
|
padbox.t = page_size.height - padbox.t
|
|
|
|
with pypdfium2_lock:
|
|
image = (
|
|
self._ppage.render(
|
|
scale=scale * 1.5,
|
|
rotation=0, # no additional rotation
|
|
crop=padbox.as_tuple(),
|
|
)
|
|
.to_pil()
|
|
.resize(
|
|
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
|
)
|
|
) # We resize the image from 1.5x the given scale to make it sharper.
|
|
|
|
return image
|
|
|
|
def get_size(self) -> Size:
|
|
with pypdfium2_lock:
|
|
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
|
|
|
def unload(self):
|
|
self._ppage = None
|
|
self.text_page = None
|
|
|
|
|
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
|
super().__init__(in_doc, path_or_stream)
|
|
|
|
try:
|
|
with pypdfium2_lock:
|
|
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
except PdfiumError as e:
|
|
raise RuntimeError(
|
|
f"pypdfium could not load document with hash {self.document_hash}"
|
|
) from e
|
|
|
|
def page_count(self) -> int:
|
|
with pypdfium2_lock:
|
|
return len(self._pdoc)
|
|
|
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
|
with pypdfium2_lock:
|
|
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
|
|
|
def is_valid(self) -> bool:
|
|
return self.page_count() > 0
|
|
|
|
def unload(self):
|
|
super().unload()
|
|
with pypdfium2_lock:
|
|
self._pdoc.close()
|
|
self._pdoc = None
|