fix: Improve OCR results, tighten criteria before dropping bitmap areas (#719)

fix: Properly care for all bitmap elements in OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-01-10 10:38:49 +01:00 · 2025-01-10 10:38:49 +01:00 · 5a060f237d
commit 5a060f237d
parent 9a6b5c8c8d
5 changed files with 19 additions and 16 deletions
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32

        for i in range(len(self._dpage["images"])):
            bitmap = self._dpage["images"][i]
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
        return self.valid

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
        for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
            pos = obj.get_pos()
            cropbox = BoundingBox.from_tuple(
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):

    use_gpu: Optional[bool] = None

-    confidence_threshold: float = 0.65
+    confidence_threshold: float = 0.5

    model_storage_directory: Optional[str] = None
    recog_network: Optional[str] = "standard"
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -8,7 +8,7 @@ import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
 from rtree import index
-from scipy.ndimage import find_objects, label
+from scipy.ndimage import binary_dilation, find_objects, label

 from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
@ -43,6 +43,12 @@ class BaseOcrModel(BasePageModel):

            np_image = np.array(image)

+            # Dilate the image by 10 pixels to merge nearby bitmap rectangles
+            structure = np.ones(
+                (20, 20)
+            )  # Create a 20x20 structure element (10 pixels in all directions)
+            np_image = binary_dilation(np_image > 0, structure=structure)
+
            # Find the connected components
            labeled_image, num_features = label(
                np_image > 0
@ -72,7 +78,7 @@ class BaseOcrModel(BasePageModel):
            bitmap_rects = []
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

-        # return full-page rectangle if sufficiently covered with bitmaps
+        # return full-page rectangle if page is dominantly covered with bitmaps
        if self.options.force_full_page_ocr or coverage > max(
            BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
        ):
@ -85,17 +91,11 @@ class BaseOcrModel(BasePageModel):
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]
-        # return individual rectangles if the bitmap coverage is smaller
-        else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
-
-            # skip OCR if the bitmap area on the page is smaller than the options threshold
-            ocr_rects = [
-                rect
-                for rect in ocr_rects
-                if rect.area() / (page.size.width * page.size.height)
-                > self.options.bitmap_area_threshold
-            ]
+        # return individual rectangles if the bitmap coverage is above the threshold
+        elif coverage > self.options.bitmap_area_threshold:
            return ocr_rects
+        else:  # overall coverage of bitmaps is too low, drop all bitmap rectangles.
+            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
@ -162,6 +162,9 @@ class BaseOcrModel(BasePageModel):
            x0 *= scale_x
            x1 *= scale_x

+            if y1 <= y0:
+                y1, y0 = y0, y1
+
            color = "gray"
            if isinstance(tc, OcrCell):
                color = "magenta"