feat: add factory for ocr engines via plugins (#1010)
* add factory for ocr engines Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply pre-commit after rebase Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add picture description factory Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix enable option Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * switch to create methods Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * make `options` an explicit kwarg Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * keep old lock of docling-core Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix lock Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add allow_external_plugins option Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add factory return and ignore options type Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> Co-authored-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
3960b199d6
commit
6eaae3cba0
@ -9,6 +9,7 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
|
import rich.table
|
||||||
import typer
|
import typer
|
||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
@ -30,18 +31,14 @@ from docling.datamodel.pipeline_options import (
|
|||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrEngine,
|
|
||||||
OcrMacOptions,
|
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PdfBackend,
|
PdfBackend,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
RapidOcrOptions,
|
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
TesseractCliOcrOptions,
|
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
|
from docling.models.factories import get_ocr_factory
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
@ -49,8 +46,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
|
|||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
|
console = Console()
|
||||||
err_console = Console(stderr=True)
|
err_console = Console(stderr=True)
|
||||||
|
|
||||||
|
ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
|
||||||
|
ocr_engines_enum_internal = ocr_factory_internal.get_enum()
|
||||||
|
|
||||||
app = typer.Typer(
|
app = typer.Typer(
|
||||||
name="Docling",
|
name="Docling",
|
||||||
@ -78,6 +78,24 @@ def version_callback(value: bool):
|
|||||||
raise typer.Exit()
|
raise typer.Exit()
|
||||||
|
|
||||||
|
|
||||||
|
def show_external_plugins_callback(value: bool):
    """Print the OCR engines contributed by third-party plugins, then exit.

    Intended as the callback of the ``--show-external-plugins`` option.
    When ``value`` is falsy the callback does nothing. Otherwise it loads
    the OCR factory with external plugins allowed, prints a table of every
    registered engine whose module is not under ``docling.``, and raises
    ``typer.Exit`` to stop the command.
    """
    if not value:
        return

    ocr_factory_all = get_ocr_factory(allow_external_plugins=True)

    table = rich.table.Table(title="Available OCR engines")
    table.add_column("Name", justify="right")
    table.add_column("Plugin")
    table.add_column("Package")

    # Only engines that come from outside the docling package are listed.
    external_engines = (
        meta
        for meta in ocr_factory_all.registered_meta.values()
        if not meta.module.startswith("docling.")
    )
    for meta in external_engines:
        name_cell = f"[bold]{meta.kind}[/bold]"
        # The top-level package is the first dotted component of the module.
        package_cell = meta.module.split(".")[0]
        table.add_row(name_cell, meta.plugin_name, package_cell)

    rich.print(table)
    raise typer.Exit()
|
||||||
|
|
||||||
|
|
||||||
def export_documents(
|
def export_documents(
|
||||||
conv_results: Iterable[ConversionResult],
|
conv_results: Iterable[ConversionResult],
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
@ -196,8 +214,16 @@ def convert(
|
|||||||
),
|
),
|
||||||
] = False,
|
] = False,
|
||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
str,
|
||||||
] = OcrEngine.EASYOCR,
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help=(
|
||||||
|
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
||||||
|
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
|
||||||
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
||||||
|
),
|
||||||
|
),
|
||||||
|
] = EasyOcrOptions.kind,
|
||||||
ocr_lang: Annotated[
|
ocr_lang: Annotated[
|
||||||
Optional[str],
|
Optional[str],
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -241,6 +267,21 @@ def convert(
|
|||||||
..., help="Must be enabled when using models connecting to remote services."
|
..., help="Must be enabled when using models connecting to remote services."
|
||||||
),
|
),
|
||||||
] = False,
|
] = False,
|
||||||
|
allow_external_plugins: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
..., help="Must be enabled for loading modules from third-party plugins."
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
|
show_external_plugins: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
|
||||||
|
callback=show_external_plugins_callback,
|
||||||
|
is_eager=True,
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
abort_on_error: Annotated[
|
abort_on_error: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -368,18 +409,11 @@ def convert(
|
|||||||
export_txt = OutputFormat.TEXT in to_formats
|
export_txt = OutputFormat.TEXT in to_formats
|
||||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||||
|
|
||||||
if ocr_engine == OcrEngine.EASYOCR:
|
ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
|
||||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
|
||||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
kind=ocr_engine,
|
||||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
force_full_page_ocr=force_ocr,
|
||||||
elif ocr_engine == OcrEngine.TESSERACT:
|
)
|
||||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
|
||||||
elif ocr_engine == OcrEngine.OCRMAC:
|
|
||||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
|
||||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
|
||||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
|
||||||
|
|
||||||
ocr_lang_list = _split_list(ocr_lang)
|
ocr_lang_list = _split_list(ocr_lang)
|
||||||
if ocr_lang_list is not None:
|
if ocr_lang_list is not None:
|
||||||
@ -387,6 +421,7 @@ def convert(
|
|||||||
|
|
||||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
|
allow_external_plugins=allow_external_plugins,
|
||||||
enable_remote_services=enable_remote_services,
|
enable_remote_services=enable_remote_services,
|
||||||
accelerator_options=accelerator_options,
|
accelerator_options=accelerator_options,
|
||||||
do_ocr=ocr,
|
do_ocr=ocr,
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import warnings
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
||||||
|
|
||||||
from pydantic import (
|
from pydantic import (
|
||||||
AnyUrl,
|
AnyUrl,
|
||||||
@ -13,13 +12,8 @@ from pydantic import (
|
|||||||
Field,
|
Field,
|
||||||
field_validator,
|
field_validator,
|
||||||
model_validator,
|
model_validator,
|
||||||
validator,
|
|
||||||
)
|
|
||||||
from pydantic_settings import (
|
|
||||||
BaseSettings,
|
|
||||||
PydanticBaseSettingsSource,
|
|
||||||
SettingsConfigDict,
|
|
||||||
)
|
)
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
class BaseOptions(BaseModel):
    """Common parent of the option models.

    Subclasses are expected to set ``kind`` to the string that identifies
    them.
    """

    # Declared as a ClassVar, so it is a class-level attribute rather than
    # a model field; no default is provided here.
    kind: ClassVar[str]
|
||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
"""Modes for the TableFormer model."""
|
"""Modes for the TableFormer model."""
|
||||||
|
|
||||||
@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
|
|||||||
mode: TableFormerMode = TableFormerMode.ACCURATE
|
mode: TableFormerMode = TableFormerMode.ACCURATE
|
||||||
|
|
||||||
|
|
||||||
class OcrOptions(BaseModel):
|
class OcrOptions(BaseOptions):
|
||||||
"""OCR options."""
|
"""OCR options."""
|
||||||
|
|
||||||
kind: str
|
|
||||||
lang: List[str]
|
lang: List[str]
|
||||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||||
bitmap_area_threshold: float = (
|
bitmap_area_threshold: float = (
|
||||||
@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
|
|||||||
class RapidOcrOptions(OcrOptions):
|
class RapidOcrOptions(OcrOptions):
|
||||||
"""Options for the RapidOCR engine."""
|
"""Options for the RapidOCR engine."""
|
||||||
|
|
||||||
kind: Literal["rapidocr"] = "rapidocr"
|
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
|
||||||
|
|
||||||
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
||||||
lang: List[str] = [
|
lang: List[str] = [
|
||||||
@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
|
|||||||
class EasyOcrOptions(OcrOptions):
|
class EasyOcrOptions(OcrOptions):
|
||||||
"""Options for the EasyOCR engine."""
|
"""Options for the EasyOCR engine."""
|
||||||
|
|
||||||
kind: Literal["easyocr"] = "easyocr"
|
kind: ClassVar[Literal["easyocr"]] = "easyocr"
|
||||||
lang: List[str] = ["fr", "de", "es", "en"]
|
lang: List[str] = ["fr", "de", "es", "en"]
|
||||||
|
|
||||||
use_gpu: Optional[bool] = None
|
use_gpu: Optional[bool] = None
|
||||||
@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
|
|||||||
class TesseractCliOcrOptions(OcrOptions):
|
class TesseractCliOcrOptions(OcrOptions):
|
||||||
"""Options for the TesseractCli engine."""
|
"""Options for the TesseractCli engine."""
|
||||||
|
|
||||||
kind: Literal["tesseract"] = "tesseract"
|
kind: ClassVar[Literal["tesseract"]] = "tesseract"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
tesseract_cmd: str = "tesseract"
|
tesseract_cmd: str = "tesseract"
|
||||||
path: Optional[str] = None
|
path: Optional[str] = None
|
||||||
@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
|||||||
class TesseractOcrOptions(OcrOptions):
|
class TesseractOcrOptions(OcrOptions):
|
||||||
"""Options for the Tesseract engine."""
|
"""Options for the Tesseract engine."""
|
||||||
|
|
||||||
kind: Literal["tesserocr"] = "tesserocr"
|
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
||||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||||
path: Optional[str] = None
|
path: Optional[str] = None
|
||||||
|
|
||||||
@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
class OcrMacOptions(OcrOptions):
|
class OcrMacOptions(OcrOptions):
|
||||||
"""Options for the Mac OCR engine."""
|
"""Options for the Mac OCR engine."""
|
||||||
|
|
||||||
kind: Literal["ocrmac"] = "ocrmac"
|
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
|
||||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||||
recognition: str = "accurate"
|
recognition: str = "accurate"
|
||||||
framework: str = "vision"
|
framework: str = "vision"
|
||||||
@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PictureDescriptionBaseOptions(BaseModel):
|
class PictureDescriptionBaseOptions(BaseOptions):
|
||||||
kind: str
|
|
||||||
batch_size: int = 8
|
batch_size: int = 8
|
||||||
scale: float = 2
|
scale: float = 2
|
||||||
|
|
||||||
@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
||||||
kind: Literal["api"] = "api"
|
kind: ClassVar[Literal["api"]] = "api"
|
||||||
|
|
||||||
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
||||||
headers: Dict[str, str] = {}
|
headers: Dict[str, str] = {}
|
||||||
@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
|||||||
|
|
||||||
|
|
||||||
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
||||||
kind: Literal["vlm"] = "vlm"
|
kind: ClassVar[Literal["vlm"]] = "vlm"
|
||||||
|
|
||||||
repo_id: str
|
repo_id: str
|
||||||
prompt: str = "Describe this image in a few sentences."
|
prompt: str = "Describe this image in a few sentences."
|
||||||
@ -305,6 +303,7 @@ class PdfBackend(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
# Define an enum for the ocr engines
|
# Define an enum for the ocr engines
|
||||||
|
@deprecated("Use ocr_factory.registered_enum")
|
||||||
class OcrEngine(str, Enum):
|
class OcrEngine(str, Enum):
|
||||||
"""Enum of valid OCR engines."""
|
"""Enum of valid OCR engines."""
|
||||||
|
|
||||||
@ -324,6 +323,7 @@ class PipelineOptions(BaseModel):
|
|||||||
document_timeout: Optional[float] = None
|
document_timeout: Optional[float] = None
|
||||||
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
||||||
enable_remote_services: bool = False
|
enable_remote_services: bool = False
|
||||||
|
allow_external_plugins: bool = False
|
||||||
|
|
||||||
|
|
||||||
class PaginatedPipelineOptions(PipelineOptions):
|
class PaginatedPipelineOptions(PipelineOptions):
|
||||||
@ -359,17 +359,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
# If True, text from backend will be used instead of generated text
|
# If True, text from backend will be used instead of generated text
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||||
ocr_options: Union[
|
ocr_options: OcrOptions = EasyOcrOptions()
|
||||||
EasyOcrOptions,
|
picture_description_options: PictureDescriptionBaseOptions = (
|
||||||
TesseractCliOcrOptions,
|
smolvlm_picture_description
|
||||||
TesseractOcrOptions,
|
)
|
||||||
OcrMacOptions,
|
|
||||||
RapidOcrOptions,
|
|
||||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
|
||||||
picture_description_options: Annotated[
|
|
||||||
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
|
||||||
Field(discriminator="kind"),
|
|
||||||
] = smolvlm_picture_description
|
|
||||||
|
|
||||||
images_scale: float = 1.0
|
images_scale: float = 1.0
|
||||||
generate_page_images: bool = False
|
generate_page_images: bool = False
|
||||||
|
@ -1,14 +1,22 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Generic, Iterable, Optional
|
from typing import Any, Generic, Iterable, Optional, Protocol, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.datamodel.pipeline_options import BaseOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
|
||||||
|
|
||||||
|
class BaseModelWithOptions(Protocol):
    """Structural interface for models that are configured by an options object."""

    @classmethod
    def get_options_type(cls) -> Type[BaseOptions]:
        """Return the options class associated with this model class."""
        ...

    def __init__(self, *, options: BaseOptions, **kwargs):
        """Accept the options as a keyword-only argument, plus extra keywords."""
        ...
|
||||||
|
|
||||||
|
|
||||||
class BasePageModel(ABC):
|
class BasePageModel(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __call__(
|
def __call__(
|
||||||
|
@ -2,7 +2,7 @@ import copy
|
|||||||
import logging
|
import logging
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List
|
from typing import Iterable, List, Optional, Type
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -13,15 +13,22 @@ from scipy.ndimage import binary_dilation, find_objects, label
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrOptions
|
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseOcrModel(BasePageModel):
|
class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
||||||
def __init__(self, enabled: bool, options: OcrOptions):
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
|
options: OcrOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
|
):
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
@ -186,3 +193,8 @@ class BaseOcrModel(BasePageModel):
|
|||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
) -> Iterable[Page]:
|
) -> Iterable[Page]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
pass
|
||||||
|
@ -2,7 +2,7 @@ import logging
|
|||||||
import warnings
|
import warnings
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional
|
from typing import Iterable, List, Optional, Type
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -14,6 +14,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
|
OcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
@ -34,7 +35,12 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
options: EasyOcrOptions,
|
options: EasyOcrOptions,
|
||||||
accelerator_options: AcceleratorOptions,
|
accelerator_options: AcceleratorOptions,
|
||||||
):
|
):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: EasyOcrOptions
|
self.options: EasyOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
@ -180,3 +186,7 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
return EasyOcrOptions
|
||||||
|
27
docling/models/factories/__init__.py
Normal file
27
docling/models/factories/__init__.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from docling.models.factories.ocr_factory import OcrFactory
|
||||||
|
from docling.models.factories.picture_description_factory import (
|
||||||
|
PictureDescriptionFactory,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
    """Build an OCR factory populated from the installed plugins.

    The result is memoized by ``lru_cache``, so repeated calls with the same
    ``allow_external_plugins`` value return the same factory object.
    """
    ocr_factory = OcrFactory()
    ocr_factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    registered = ocr_factory.registered_kind
    logger.info("Registered ocr engines: %r", registered)
    return ocr_factory
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache()
def get_picture_description_factory(
    allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
    """Build a picture-description factory populated from the installed plugins.

    The result is memoized by ``lru_cache``, so repeated calls with the same
    ``allow_external_plugins`` value return the same factory object.
    """
    description_factory = PictureDescriptionFactory()
    description_factory.load_from_plugins(
        allow_external_plugins=allow_external_plugins
    )
    registered = description_factory.registered_kind
    logger.info("Registered picture descriptions: %r", registered)
    return description_factory
|
122
docling/models/factories/base_factory.py
Normal file
122
docling/models/factories/base_factory.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
import enum
|
||||||
|
import logging
|
||||||
|
from abc import ABCMeta
|
||||||
|
from typing import Generic, Optional, Type, TypeVar
|
||||||
|
|
||||||
|
from pluggy import PluginManager
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.datamodel.pipeline_options import BaseOptions
|
||||||
|
from docling.models.base_model import BaseModelWithOptions
|
||||||
|
|
||||||
|
A = TypeVar("A", bound=BaseModelWithOptions)
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class FactoryMeta(BaseModel):
    """Record describing where a registered class came from."""

    # Value of the ``kind`` attribute of the registered options class.
    kind: str
    # Name under which the plugin was listed by the plugin manager.
    plugin_name: str
    # ``__name__`` of the plugin module that supplied the class.
    module: str
|
||||||
|
|
||||||
|
|
||||||
|
class BaseFactory(Generic[A], metaclass=ABCMeta):
    """Registry that maps option classes to the model classes consuming them.

    Model classes are registered either directly through ``register`` or by
    discovering plugins with ``load_from_plugins``. Each model class reports
    its options class via ``get_options_type()``; that options class is the
    registry key, and its ``kind`` string is the user-facing name.
    """

    default_plugin_name = "docling"

    def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
        """Create an empty factory.

        Args:
            plugin_attr_name: Name of the attribute looked up on each plugin
                module, and of the key read from the config it returns.
            plugin_name: Default name used for the plugin manager and the
                entry-point group in ``load_from_plugins``.
        """
        self.plugin_name = plugin_name
        self.plugin_attr_name = plugin_attr_name

        self._classes: dict[Type[BaseOptions], Type[A]] = {}
        self._meta: dict[Type[BaseOptions], FactoryMeta] = {}

    @property
    def registered_kind(self) -> list[str]:
        """The ``kind`` of every registered options class, in registration order."""
        return [opt.kind for opt in self._classes]

    def get_enum(self) -> Type[enum.Enum]:
        """Build a ``str``-mixin enum class with one member per registered kind.

        Member names and member values are both the ``kind`` string.
        """
        return enum.Enum(  # type: ignore[return-value]
            self.plugin_attr_name + "_enum",
            names={kind: kind for kind in self.registered_kind},
            type=str,
            module=__name__,
        )

    @property
    def classes(self):
        """The mapping from options class to registered model class."""
        return self._classes

    @property
    def registered_meta(self):
        """The mapping from options class to its ``FactoryMeta`` record."""
        return self._meta

    def create_instance(self, options: BaseOptions, **kwargs) -> A:
        """Instantiate the model class registered for ``type(options)``.

        Args:
            options: Options object; its exact type selects the model class.
            **kwargs: Forwarded to the model constructor alongside ``options``.

        Raises:
            RuntimeError: If no class is registered for ``type(options)``.
        """
        # Only the registry lookup is guarded: a KeyError raised inside the
        # model constructor must propagate instead of being misreported as
        # "No class found".
        try:
            _cls = self._classes[type(options)]
        except KeyError as err:
            raise RuntimeError(
                self._err_msg_on_class_not_found(options.kind)
            ) from err
        return _cls(options=options, **kwargs)

    def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
        """Instantiate the registered options class whose ``kind`` equals ``kind``.

        Args:
            kind: The kind string to look for.
            *args: Forwarded to the options class constructor.
            **kwargs: Forwarded to the options class constructor.

        Raises:
            RuntimeError: If no registered options class has that kind.
        """
        for opt_cls in self._classes:
            if opt_cls.kind == kind:
                return opt_cls(*args, **kwargs)
        raise RuntimeError(self._err_msg_on_class_not_found(kind))

    def _err_msg_on_class_not_found(self, kind: str):
        """Format the error text for an unknown ``kind``, listing the known ones."""
        msg_str = "\n".join(
            f"\t{opt.kind!r} => {cls!r}" for opt, cls in self._classes.items()
        )

        return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"

    def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
        """Register ``cls`` under the options class it reports.

        Args:
            cls: Model class; ``cls.get_options_type()`` provides the key.
            plugin_name: Plugin name stored in the metadata record.
            plugin_module_name: Module name stored in the metadata record.

        Raises:
            ValueError: If the options class is already registered.
        """
        opt_type = cls.get_options_type()

        if opt_type in self._classes:
            raise ValueError(
                f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
            )

        self._classes[opt_type] = cls
        self._meta[opt_type] = FactoryMeta(
            kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
        )

    def load_from_plugins(
        self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
    ):
        """Discover plugins through entry points and register what they provide.

        Args:
            plugin_name: Name used for the plugin manager and entry-point
                group; falls back to ``self.plugin_name`` when falsy.
            allow_external_plugins: When False, plugin modules whose name
                does not start with ``"docling."`` are skipped with a warning.
        """
        group_name = plugin_name or self.plugin_name

        plugin_manager = PluginManager(group_name)
        plugin_manager.load_setuptools_entrypoints(group_name)

        # A distinct loop variable keeps the ``plugin_name`` argument intact.
        for loaded_name, plugin_module in plugin_manager.list_name_plugin():
            plugin_module_name = str(plugin_module.__name__)  # type: ignore

            if not allow_external_plugins and not plugin_module_name.startswith(
                "docling."
            ):
                logger.warning(
                    f"The plugin {loaded_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
                )
                continue

            # Plugins that do not expose a callable under this attribute
            # contribute nothing to this factory.
            attr = getattr(plugin_module, self.plugin_attr_name, None)

            if callable(attr):
                logger.info("Loading plugin %r", loaded_name)

                config = attr()
                self.process_plugin(config, loaded_name, plugin_module_name)

    def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
        """Register every item listed under ``config[self.plugin_attr_name]``.

        A ``ValueError`` from ``register`` is logged as a warning and the
        remaining items are still processed.
        """
        for item in config[self.plugin_attr_name]:
            try:
                self.register(item, plugin_name, plugin_module_name)
            except ValueError:
                logger.warning("%r already registered", item)
|
11
docling/models/factories/ocr_factory.py
Normal file
11
docling/models/factories/ocr_factory.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
|
from docling.models.factories.base_factory import BaseFactory
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class OcrFactory(BaseFactory[BaseOcrModel]):
    """Factory of OCR models, bound to the ``ocr_engines`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseFactory`` after the attribute name."""
        plugin_attr_name = "ocr_engines"
        super().__init__(plugin_attr_name, *args, **kwargs)
|
11
docling/models/factories/picture_description_factory.py
Normal file
11
docling/models/factories/picture_description_factory.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from docling.models.factories.base_factory import BaseFactory
|
||||||
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
    """Factory of picture-description models, bound to the ``picture_description`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseFactory`` after the attribute name."""
        plugin_attr_name = "picture_description"
        super().__init__(plugin_attr_name, *args, **kwargs)
|
@ -1,13 +1,19 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Iterable, Optional, Tuple
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Optional, Tuple, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
OcrMacOptions,
|
||||||
|
OcrOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
@ -16,13 +22,26 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class OcrMacModel(BaseOcrModel):
|
class OcrMacModel(BaseOcrModel):
|
||||||
def __init__(self, enabled: bool, options: OcrMacOptions):
|
def __init__(
|
||||||
super().__init__(enabled=enabled, options=options)
|
self,
|
||||||
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
|
options: OcrMacOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: OcrMacOptions
|
self.options: OcrMacOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
|
if "darwin" != sys.platform:
|
||||||
|
raise RuntimeError(f"OcrMac is only supported on Mac.")
|
||||||
install_errmsg = (
|
install_errmsg = (
|
||||||
"ocrmac is not correctly installed. "
|
"ocrmac is not correctly installed. "
|
||||||
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
||||||
@ -121,3 +140,7 @@ class OcrMacModel(BaseOcrModel):
|
|||||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
return OcrMacOptions
|
||||||
|
@ -1,13 +1,18 @@
|
|||||||
import base64
|
import base64
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
from typing import Iterable, List, Optional
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Type, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pydantic import BaseModel, ConfigDict
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
PictureDescriptionApiOptions,
|
||||||
|
PictureDescriptionBaseOptions,
|
||||||
|
)
|
||||||
from docling.exceptions import OperationNotAllowed
|
from docling.exceptions import OperationNotAllowed
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
|
|
||||||
@ -46,13 +51,25 @@ class ApiResponse(BaseModel):
|
|||||||
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
|
||||||
# elements_batch_size = 4
|
# elements_batch_size = 4
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
||||||
|
return PictureDescriptionApiOptions
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
enabled: bool,
|
enabled: bool,
|
||||||
enable_remote_services: bool,
|
enable_remote_services: bool,
|
||||||
|
artifacts_path: Optional[Union[Path, str]],
|
||||||
options: PictureDescriptionApiOptions,
|
options: PictureDescriptionApiOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
):
|
):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
enable_remote_services=enable_remote_services,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: PictureDescriptionApiOptions
|
self.options: PictureDescriptionApiOptions
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterable, List, Optional, Union
|
from typing import Any, Iterable, List, Optional, Type, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -13,20 +14,30 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
|
|||||||
)
|
)
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
PictureDescriptionBaseOptions,
|
||||||
|
)
|
||||||
from docling.models.base_model import (
|
from docling.models.base_model import (
|
||||||
BaseItemAndImageEnrichmentModel,
|
BaseItemAndImageEnrichmentModel,
|
||||||
|
BaseModelWithOptions,
|
||||||
ItemAndImageEnrichmentElement,
|
ItemAndImageEnrichmentElement,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
|
class PictureDescriptionBaseModel(
|
||||||
|
BaseItemAndImageEnrichmentModel, BaseModelWithOptions
|
||||||
|
):
|
||||||
images_scale: float = 2.0
|
images_scale: float = 2.0
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
*,
|
||||||
enabled: bool,
|
enabled: bool,
|
||||||
|
enable_remote_services: bool,
|
||||||
|
artifacts_path: Optional[Union[Path, str]],
|
||||||
options: PictureDescriptionBaseOptions,
|
options: PictureDescriptionBaseOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
):
|
):
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.options = options
|
self.options = options
|
||||||
@ -62,3 +73,8 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
|
|||||||
PictureDescriptionData(text=output, provenance=self.provenance)
|
PictureDescriptionData(text=output, provenance=self.provenance)
|
||||||
)
|
)
|
||||||
yield item
|
yield item
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
||||||
|
pass
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Union
|
from typing import Iterable, Optional, Type, Union
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
|
PictureDescriptionBaseOptions,
|
||||||
PictureDescriptionVlmOptions,
|
PictureDescriptionVlmOptions,
|
||||||
)
|
)
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
@ -13,14 +14,25 @@ from docling.utils.accelerator_utils import decide_device
|
|||||||
|
|
||||||
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
||||||
|
return PictureDescriptionVlmOptions
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
enabled: bool,
|
enabled: bool,
|
||||||
|
enable_remote_services: bool,
|
||||||
artifacts_path: Optional[Union[Path, str]],
|
artifacts_path: Optional[Union[Path, str]],
|
||||||
options: PictureDescriptionVlmOptions,
|
options: PictureDescriptionVlmOptions,
|
||||||
accelerator_options: AcceleratorOptions,
|
accelerator_options: AcceleratorOptions,
|
||||||
):
|
):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
enable_remote_services=enable_remote_services,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: PictureDescriptionVlmOptions
|
self.options: PictureDescriptionVlmOptions
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
|
0
docling/models/plugins/__init__.py
Normal file
0
docling/models/plugins/__init__.py
Normal file
28
docling/models/plugins/defaults.py
Normal file
28
docling/models/plugins/defaults.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
|
from docling.models.ocr_mac_model import OcrMacModel
|
||||||
|
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
||||||
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||||
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||||
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||||
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_engines():
|
||||||
|
return {
|
||||||
|
"ocr_engines": [
|
||||||
|
EasyOcrModel,
|
||||||
|
OcrMacModel,
|
||||||
|
RapidOcrModel,
|
||||||
|
TesseractOcrModel,
|
||||||
|
TesseractOcrCliModel,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def picture_description():
|
||||||
|
return {
|
||||||
|
"picture_description": [
|
||||||
|
PictureDescriptionVlmModel,
|
||||||
|
PictureDescriptionApiModel,
|
||||||
|
]
|
||||||
|
}
|
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Iterable
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Optional, Type
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -10,6 +11,7 @@ from docling.datamodel.document import ConversionResult
|
|||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
AcceleratorDevice,
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
|
OcrOptions,
|
||||||
RapidOcrOptions,
|
RapidOcrOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -24,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
enabled: bool,
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
options: RapidOcrOptions,
|
options: RapidOcrOptions,
|
||||||
accelerator_options: AcceleratorOptions,
|
accelerator_options: AcceleratorOptions,
|
||||||
):
|
):
|
||||||
super().__init__(enabled=enabled, options=options)
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: RapidOcrOptions
|
self.options: RapidOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
@ -135,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
|
|||||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
return RapidOcrOptions
|
||||||
|
@ -3,8 +3,9 @@ import io
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
from subprocess import DEVNULL, PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import Iterable, List, Optional, Tuple
|
from typing import Iterable, List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
@ -12,7 +13,11 @@ from docling_core.types.doc.page import BoundingRectangle, TextCell
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
OcrOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import map_tesseract_script
|
||||||
@ -22,8 +27,19 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class TesseractOcrCliModel(BaseOcrModel):
|
class TesseractOcrCliModel(BaseOcrModel):
|
||||||
def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
|
def __init__(
|
||||||
super().__init__(enabled=enabled, options=options)
|
self,
|
||||||
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
|
options: TesseractCliOcrOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: TesseractCliOcrOptions
|
self.options: TesseractCliOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
@ -257,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
return TesseractCliOcrOptions
|
||||||
|
@ -1,12 +1,17 @@
|
|||||||
import logging
|
import logging
|
||||||
from typing import Iterable
|
from pathlib import Path
|
||||||
|
from typing import Iterable, Optional, Type
|
||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
OcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.utils.ocr_utils import map_tesseract_script
|
from docling.utils.ocr_utils import map_tesseract_script
|
||||||
@ -16,8 +21,19 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class TesseractOcrModel(BaseOcrModel):
|
class TesseractOcrModel(BaseOcrModel):
|
||||||
def __init__(self, enabled: bool, options: TesseractOcrOptions):
|
def __init__(
|
||||||
super().__init__(enabled=enabled, options=options)
|
self,
|
||||||
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
|
options: TesseractOcrOptions,
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
enabled=enabled,
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
options=options,
|
||||||
|
accelerator_options=accelerator_options,
|
||||||
|
)
|
||||||
self.options: TesseractOcrOptions
|
self.options: TesseractOcrOptions
|
||||||
|
|
||||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
@ -200,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_options_type(cls) -> Type[OcrOptions]:
|
||||||
|
return TesseractOcrOptions
|
||||||
|
@ -10,16 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
EasyOcrOptions,
|
|
||||||
OcrMacOptions,
|
|
||||||
PdfPipelineOptions,
|
|
||||||
PictureDescriptionApiOptions,
|
|
||||||
PictureDescriptionVlmOptions,
|
|
||||||
RapidOcrOptions,
|
|
||||||
TesseractCliOcrOptions,
|
|
||||||
TesseractOcrOptions,
|
|
||||||
)
|
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_ocr_model import BaseOcrModel
|
from docling.models.base_ocr_model import BaseOcrModel
|
||||||
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
||||||
@ -27,22 +18,16 @@ from docling.models.document_picture_classifier import (
|
|||||||
DocumentPictureClassifier,
|
DocumentPictureClassifier,
|
||||||
DocumentPictureClassifierOptions,
|
DocumentPictureClassifierOptions,
|
||||||
)
|
)
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.factories import get_ocr_factory, get_picture_description_factory
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.ocr_mac_model import OcrMacModel
|
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||||
from docling.models.page_preprocessing_model import (
|
from docling.models.page_preprocessing_model import (
|
||||||
PagePreprocessingModel,
|
PagePreprocessingModel,
|
||||||
PagePreprocessingOptions,
|
PagePreprocessingOptions,
|
||||||
)
|
)
|
||||||
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
|
||||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
|
||||||
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
|
||||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.model_downloader import download_models
|
from docling.utils.model_downloader import download_models
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
@ -78,10 +63,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
||||||
raise RuntimeError(
|
|
||||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
# Pre-processing
|
# Pre-processing
|
||||||
@ -164,66 +146,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
output_dir = download_models(output_dir=local_dir, force=force, progress=False)
|
||||||
return output_dir
|
return output_dir
|
||||||
|
|
||||||
def get_ocr_model(
|
def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
|
||||||
self, artifacts_path: Optional[Path] = None
|
factory = get_ocr_factory(
|
||||||
) -> Optional[BaseOcrModel]:
|
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
||||||
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
|
)
|
||||||
return EasyOcrModel(
|
return factory.create_instance(
|
||||||
enabled=self.pipeline_options.do_ocr,
|
options=self.pipeline_options.ocr_options,
|
||||||
artifacts_path=artifacts_path,
|
enabled=self.pipeline_options.do_ocr,
|
||||||
options=self.pipeline_options.ocr_options,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
)
|
)
|
||||||
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
|
||||||
return TesseractOcrCliModel(
|
|
||||||
enabled=self.pipeline_options.do_ocr,
|
|
||||||
options=self.pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
|
|
||||||
return TesseractOcrModel(
|
|
||||||
enabled=self.pipeline_options.do_ocr,
|
|
||||||
options=self.pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
|
|
||||||
return RapidOcrModel(
|
|
||||||
enabled=self.pipeline_options.do_ocr,
|
|
||||||
options=self.pipeline_options.ocr_options,
|
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
|
||||||
)
|
|
||||||
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
|
||||||
if "darwin" != sys.platform:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
|
|
||||||
)
|
|
||||||
return OcrMacModel(
|
|
||||||
enabled=self.pipeline_options.do_ocr,
|
|
||||||
options=self.pipeline_options.ocr_options,
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_picture_description_model(
|
def get_picture_description_model(
|
||||||
self, artifacts_path: Optional[Path] = None
|
self, artifacts_path: Optional[Path] = None
|
||||||
) -> Optional[PictureDescriptionBaseModel]:
|
) -> Optional[PictureDescriptionBaseModel]:
|
||||||
if isinstance(
|
factory = get_picture_description_factory(
|
||||||
self.pipeline_options.picture_description_options,
|
allow_external_plugins=self.pipeline_options.allow_external_plugins
|
||||||
PictureDescriptionApiOptions,
|
)
|
||||||
):
|
return factory.create_instance(
|
||||||
return PictureDescriptionApiModel(
|
options=self.pipeline_options.picture_description_options,
|
||||||
enabled=self.pipeline_options.do_picture_description,
|
enabled=self.pipeline_options.do_picture_description,
|
||||||
enable_remote_services=self.pipeline_options.enable_remote_services,
|
enable_remote_services=self.pipeline_options.enable_remote_services,
|
||||||
options=self.pipeline_options.picture_description_options,
|
artifacts_path=artifacts_path,
|
||||||
)
|
accelerator_options=self.pipeline_options.accelerator_options,
|
||||||
elif isinstance(
|
)
|
||||||
self.pipeline_options.picture_description_options,
|
|
||||||
PictureDescriptionVlmOptions,
|
|
||||||
):
|
|
||||||
return PictureDescriptionVlmModel(
|
|
||||||
enabled=self.pipeline_options.do_picture_description,
|
|
||||||
artifacts_path=artifacts_path,
|
|
||||||
options=self.pipeline_options.picture_description_options,
|
|
||||||
accelerator_options=self.pipeline_options.accelerator_options,
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||||
with TimeRecorder(conv_res, "page_init"):
|
with TimeRecorder(conv_res, "page_init"):
|
||||||
|
4
poetry.lock
generated
4
poetry.lock
generated
@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "accelerate"
|
name = "accelerate"
|
||||||
@ -7838,4 +7838,4 @@ vlm = ["accelerate", "transformers", "transformers"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "6917ebe61625f5b719df46c3f1597c61241b2a3b81bae640d9167d20d0182dd8"
|
content-hash = "a9ace62bd5b629cb2f20186b750d7c63f383f37f2e3df04cfcc821fc83c877b8"
|
||||||
|
@ -88,6 +88,7 @@ accelerate = [
|
|||||||
]
|
]
|
||||||
pillow = ">=10.0.0,<12.0.0"
|
pillow = ">=10.0.0,<12.0.0"
|
||||||
tqdm = "^4.65.0"
|
tqdm = "^4.65.0"
|
||||||
|
pluggy = "^1.0.0"
|
||||||
pylatexenc = "^2.10"
|
pylatexenc = "^2.10"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
@ -156,6 +157,9 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
|||||||
docling = "docling.cli.main:app"
|
docling = "docling.cli.main:app"
|
||||||
docling-tools = "docling.cli.tools:app"
|
docling-tools = "docling.cli.tools:app"
|
||||||
|
|
||||||
|
[tool.poetry.plugins."docling"]
|
||||||
|
"docling_defaults" = "docling.models.plugins.defaults"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
Loading…
Reference in New Issue
Block a user