feat: add factory for ocr engines via plugins (#1010)
* add factory for ocr engines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply pre-commit after rebase

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add picture description factory

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix enable option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* switch to create methods

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* make `options` an explicit kwarg

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* keep old lock of docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add allow_external_plugins option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add factory return and ignore options type

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
@@ -1,10 +1,9 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
||||
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
||||
|
||||
from pydantic import (
|
||||
AnyUrl,
|
||||
@@ -13,13 +12,8 @@ from pydantic import (
|
||||
Field,
|
||||
field_validator,
|
||||
model_validator,
|
||||
validator,
|
||||
)
|
||||
from pydantic_settings import (
|
||||
BaseSettings,
|
||||
PydanticBaseSettingsSource,
|
||||
SettingsConfigDict,
|
||||
)
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from typing_extensions import deprecated
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
|
||||
return data
|
||||
|
||||
|
||||
class BaseOptions(BaseModel):
|
||||
"""Base class for options."""
|
||||
|
||||
kind: ClassVar[str]
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
"""Modes for the TableFormer model."""
|
||||
|
||||
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
|
||||
mode: TableFormerMode = TableFormerMode.ACCURATE
|
||||
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
class OcrOptions(BaseOptions):
|
||||
"""OCR options."""
|
||||
|
||||
kind: str
|
||||
lang: List[str]
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
bitmap_area_threshold: float = (
|
||||
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
|
||||
class RapidOcrOptions(OcrOptions):
|
||||
"""Options for the RapidOCR engine."""
|
||||
|
||||
kind: Literal["rapidocr"] = "rapidocr"
|
||||
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
|
||||
|
||||
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
||||
lang: List[str] = [
|
||||
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
|
||||
class EasyOcrOptions(OcrOptions):
|
||||
"""Options for the EasyOCR engine."""
|
||||
|
||||
kind: Literal["easyocr"] = "easyocr"
|
||||
kind: ClassVar[Literal["easyocr"]] = "easyocr"
|
||||
lang: List[str] = ["fr", "de", "es", "en"]
|
||||
|
||||
use_gpu: Optional[bool] = None
|
||||
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
|
||||
class TesseractCliOcrOptions(OcrOptions):
|
||||
"""Options for the TesseractCli engine."""
|
||||
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
kind: ClassVar[Literal["tesseract"]] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
path: Optional[str] = None
|
||||
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
class TesseractOcrOptions(OcrOptions):
|
||||
"""Options for the Tesseract engine."""
|
||||
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
|
||||
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
|
||||
class OcrMacOptions(OcrOptions):
|
||||
"""Options for the Mac OCR engine."""
|
||||
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
framework: str = "vision"
|
||||
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
|
||||
)
|
||||
|
||||
|
||||
class PictureDescriptionBaseOptions(BaseModel):
|
||||
kind: str
|
||||
class PictureDescriptionBaseOptions(BaseOptions):
|
||||
batch_size: int = 8
|
||||
scale: float = 2
|
||||
|
||||
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
|
||||
|
||||
|
||||
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
||||
kind: Literal["api"] = "api"
|
||||
kind: ClassVar[Literal["api"]] = "api"
|
||||
|
||||
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
||||
headers: Dict[str, str] = {}
|
||||
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
||||
|
||||
|
||||
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
||||
kind: Literal["vlm"] = "vlm"
|
||||
kind: ClassVar[Literal["vlm"]] = "vlm"
|
||||
|
||||
repo_id: str
|
||||
prompt: str = "Describe this image in a few sentences."
|
||||
@@ -305,6 +303,7 @@ class PdfBackend(str, Enum):
|
||||
|
||||
|
||||
# Define an enum for the ocr engines
|
||||
@deprecated("Use ocr_factory.registered_enum")
|
||||
class OcrEngine(str, Enum):
|
||||
"""Enum of valid OCR engines."""
|
||||
|
||||
@@ -324,6 +323,7 @@ class PipelineOptions(BaseModel):
|
||||
document_timeout: Optional[float] = None
|
||||
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
||||
enable_remote_services: bool = False
|
||||
allow_external_plugins: bool = False
|
||||
|
||||
|
||||
class PaginatedPipelineOptions(PipelineOptions):
|
||||
@@ -359,17 +359,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
# If True, text from backend will be used instead of generated text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[
|
||||
EasyOcrOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
OcrMacOptions,
|
||||
RapidOcrOptions,
|
||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
||||
picture_description_options: Annotated[
|
||||
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
||||
Field(discriminator="kind"),
|
||||
] = smolvlm_picture_description
|
||||
ocr_options: OcrOptions = EasyOcrOptions()
|
||||
picture_description_options: PictureDescriptionBaseOptions = (
|
||||
smolvlm_picture_description
|
||||
)
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
|
||||
Reference in New Issue
Block a user