feat: add options for choosing OCR engines (#118)
--------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
@@ -4,21 +4,33 @@ from typing import Iterable
|
||||
import numpy
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: EasyOcrOptions
|
||||
|
||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||
|
||||
if self.enabled:
|
||||
import easyocr
|
||||
try:
|
||||
import easyocr
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
|
||||
self.reader = easyocr.Reader(config["lang"])
|
||||
self.reader = easyocr.Reader(
|
||||
lang_list=self.options.lang,
|
||||
model_storage_directory=self.options.model_storage_directory,
|
||||
download_enabled=self.options.download_enabled,
|
||||
)
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
|
||||
@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user