fix: Improve OCR results, tighten criteria before dropping bitmap areas (#719)
fix: Properly care for all bitmap elements in OCR. Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
9a6b5c8c8d
commit
5a060f237d
@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
return cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
||||
for i in range(len(self._dpage["images"])):
|
||||
bitmap = self._dpage["images"][i]
|
||||
|
@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
return cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
||||
images = self._dpage["sanitized"]["images"]["data"]
|
||||
images_header = self._dpage["sanitized"]["images"]["header"]
|
||||
|
@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
return self.valid
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
|
@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
use_gpu: Optional[bool] = None
|
||||
|
||||
confidence_threshold: float = 0.65
|
||||
confidence_threshold: float = 0.5
|
||||
|
||||
model_storage_directory: Optional[str] = None
|
||||
recog_network: Optional[str] = "standard"
|
||||
|
@ -8,7 +8,7 @@ import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):
|
||||
|
||||
np_image = np.array(image)
|
||||
|
||||
# Dilate the image by 10 pixels to merge nearby bitmap rectangles
|
||||
structure = np.ones(
|
||||
(20, 20)
|
||||
) # Create a 20x20 structure element (10 pixels in all directions)
|
||||
np_image = binary_dilation(np_image > 0, structure=structure)
|
||||
|
||||
# Find the connected components
|
||||
labeled_image, num_features = label(
|
||||
np_image > 0
|
||||
@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
|
||||
bitmap_rects = []
|
||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||
|
||||
# return full-page rectangle if sufficiently covered with bitmaps
|
||||
# return full-page rectangle if page is dominantly covered with bitmaps
|
||||
if self.options.force_full_page_ocr or coverage > max(
|
||||
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
||||
):
|
||||
@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
]
|
||||
# return individual rectangles if the bitmap coverage is smaller
|
||||
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:
|
||||
|
||||
# skip OCR if the bitmap area on the page is smaller than the options threshold
|
||||
ocr_rects = [
|
||||
rect
|
||||
for rect in ocr_rects
|
||||
if rect.area() / (page.size.width * page.size.height)
|
||||
> self.options.bitmap_area_threshold
|
||||
]
|
||||
# return individual rectangles if the bitmap coverage is above the threshold
|
||||
elif coverage > self.options.bitmap_area_threshold:
|
||||
return ocr_rects
|
||||
else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
|
||||
return []
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
@ -162,6 +162,9 @@ class BaseOcrModel(BasePageModel):
|
||||
x0 *= scale_x
|
||||
x1 *= scale_x
|
||||
|
||||
if y1 <= y0:
|
||||
y1, y0 = y0, y1
|
||||
|
||||
color = "gray"
|
||||
if isinstance(tc, OcrCell):
|
||||
color = "magenta"
|
||||
|
Loading…
Reference in New Issue
Block a user