Docling/docling/datamodel/pipeline_options_asr_model.py

from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    # InferenceFramework,
    TransformersModelType,
)


class BaseAsrOptions(BaseModel):
    kind: str
    # prompt: str


class InferenceAsrFramework(str, Enum):
    # MLX = "mlx" # disabled for now
    # TRANSFORMERS = "transformers" # disabled for now
    WHISPER = "whisper"


class InlineAsrOptions(BaseAsrOptions):
    kind: Literal["inline_model_options"] = "inline_model_options"

    repo_id: str

    verbose: bool = False
    timestamps: bool = True

    temperature: float = 0.0
    max_new_tokens: int = 256
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    language: str = "en"
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True