feat: add factory for ocr engines via plugins (#1010)

* add factory for ocr engines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply pre-commit after rebase

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add picture description factory

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix enable option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* switch to create methods

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* make `options` an explicit kwarg

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* keep old lock of docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add allow_external_plugins option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add factory return and ignore options type

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-03-18 13:58:05 +01:00
committed by GitHub
parent 3960b199d6
commit 6eaae3cba0
21 changed files with 485 additions and 158 deletions

View File

@@ -1,10 +1,9 @@
import logging
import os
import re
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
from pydantic import (
AnyUrl,
@@ -13,13 +12,8 @@ from pydantic import (
Field,
field_validator,
model_validator,
validator,
)
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import deprecated
_log = logging.getLogger(__name__)
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
return data
class BaseOptions(BaseModel):
"""Base class for options."""
kind: ClassVar[str]
class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
mode: TableFormerMode = TableFormerMode.ACCURATE
class OcrOptions(BaseModel):
class OcrOptions(BaseOptions):
"""OCR options."""
kind: str
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = (
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""
kind: Literal["rapidocr"] = "rapidocr"
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
# English and chinese are the most commly used models and have been tested with RapidOCR.
lang: List[str] = [
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""
kind: Literal["easyocr"] = "easyocr"
kind: ClassVar[Literal["easyocr"]] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: Optional[bool] = None
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""
kind: Literal["tesseract"] = "tesseract"
kind: ClassVar[Literal["tesseract"]] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
path: Optional[str] = None
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""
kind: Literal["tesserocr"] = "tesserocr"
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""
kind: Literal["ocrmac"] = "ocrmac"
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
framework: str = "vision"
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
)
class PictureDescriptionBaseOptions(BaseModel):
kind: str
class PictureDescriptionBaseOptions(BaseOptions):
batch_size: int = 8
scale: float = 2
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
kind: Literal["api"] = "api"
kind: ClassVar[Literal["api"]] = "api"
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
headers: Dict[str, str] = {}
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
kind: Literal["vlm"] = "vlm"
kind: ClassVar[Literal["vlm"]] = "vlm"
repo_id: str
prompt: str = "Describe this image in a few sentences."
@@ -305,6 +303,7 @@ class PdfBackend(str, Enum):
# Define an enum for the ocr engines
@deprecated("Use ocr_factory.registered_enum")
class OcrEngine(str, Enum):
"""Enum of valid OCR engines."""
@@ -324,6 +323,7 @@ class PipelineOptions(BaseModel):
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()
enable_remote_services: bool = False
allow_external_plugins: bool = False
class PaginatedPipelineOptions(PipelineOptions):
@@ -359,17 +359,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
# If True, text from backend will be used instead of generated text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
EasyOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
OcrMacOptions,
RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind")
picture_description_options: Annotated[
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
Field(discriminator="kind"),
] = smolvlm_picture_description
ocr_options: OcrOptions = EasyOcrOptions()
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
images_scale: float = 1.0
generate_page_images: bool = False