feat(ocr): added support for RapidOCR engine (#415)

* adding rapidocr engine for ocr in docling

Signed-off-by: swayam-singhal <swayam.singhal@inito.com>

* fixing styling format

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* updating pyproject.toml and poetry.lock to fix ci bugs

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* help poetry pinning for python3.9

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* simplifying rapidocr options so that device can be changed using a single option for all models

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* fix styling issues and small bug in rapidOcrOptions

Signed-off-by: Swaymaw <swaymaw@gmail.com>

* use default device until we enable global management

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: swayam-singhal <swayam.singhal@inito.com>
Signed-off-by: Swaymaw <swaymaw@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: swayam-singhal <swayam.singhal@inito.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Swaymaw
2024-11-27 18:27:41 +05:30
committed by GitHub
parent 767563bf8b
commit 85b29990be
9 changed files with 405 additions and 13 deletions

View File

@@ -13,6 +13,7 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
@@ -26,6 +27,7 @@ from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
@@ -121,6 +123,11 @@ class StandardPdfPipeline(PaginatedPipeline):
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
return RapidOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(