fix: Improve OCR results, tighten criteria before dropping bitmap areas (#719)
fix: Properly handle all bitmap elements in OCR. Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -132,7 +132,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
return cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
||||
for i in range(len(self._dpage["images"])):
|
||||
bitmap = self._dpage["images"][i]
|
||||
|
||||
@@ -140,7 +140,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
return cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
||||
images = self._dpage["sanitized"]["images"]["data"]
|
||||
images_header = self._dpage["sanitized"]["images"]["header"]
|
||||
|
||||
@@ -39,7 +39,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
return self.valid
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 32 * 32
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
|
||||
Reference in New Issue
Block a user