
* scaffolding in place Signed-off-by: Peter Staar <taa@zurich.ibm.com> * doing scaffolding for audio pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * WIP: got first transcription working Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, time to start cleaning up Signed-off-by: Peter Staar <taa@zurich.ibm.com> * first working ASR pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added openai-whisper as a first transcription model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updating with asr_options Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalised the first working ASR pipeline with Whisper Signed-off-by: Peter Staar <taa@zurich.ibm.com> * use whisper from the latest git commit Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * updated comment Signed-off-by: Peter Staar <taa@zurich.ibm.com> * AudioBackend -> DummyBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * file rename Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename to NoOpBackend, add test for ASR pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Support every format in NoOpBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing audio file and test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Install ffmpeg system dependency for ASR test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Peter W. J. 
Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
58 lines
1.4 KiB
Python
58 lines
1.4 KiB
Python
from enum import Enum
|
|
from typing import Any, Dict, List, Literal, Optional, Union
|
|
|
|
from pydantic import AnyUrl, BaseModel
|
|
from typing_extensions import deprecated
|
|
|
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
from docling.datamodel.pipeline_options_vlm_model import (
|
|
# InferenceFramework,
|
|
TransformersModelType,
|
|
)
|
|
|
|
|
|
class BaseAsrOptions(BaseModel):
    """Base model shared by all ASR option variants.

    It declares a single required field, ``kind``, which concrete option
    classes narrow to a fixed literal value.
    """

    # Identifier of the option variant; required here, no default.
    kind: str
    # prompt: str
|
|
|
|
|
|
class InferenceAsrFramework(str, Enum):
    """Inference frameworks selectable for ASR.

    Members are also ``str`` instances, so they compare equal to their
    plain string value (e.g. ``InferenceAsrFramework.WHISPER == "whisper"``).
    Only ``WHISPER`` is currently enabled.
    """

    # MLX = "mlx"  # disabled for now
    # TRANSFORMERS = "transformers"  # disabled for now
    WHISPER = "whisper"
|
|
|
|
|
|
class InlineAsrOptions(BaseAsrOptions):
    """Options for an ASR model identified by a repository id.

    ``kind`` is fixed to ``"inline_model_options"``. ``repo_id`` is the only
    field without a default. How the remaining settings are consumed is
    decided by the code that reads this model, which is not part of this
    class.
    """

    kind: Literal["inline_model_options"] = "inline_model_options"

    # Model repository identifier (required).
    # NOTE(review): presumably an "org/name"-style id -- confirm with the loader.
    repo_id: str

    # Output / logging toggles.
    verbose: bool = False
    timestamps: bool = True

    # Decoding settings.
    temperature: float = 0.0
    max_new_tokens: int = 256
    # NOTE(review): presumably a duration in seconds -- confirm with the consumer.
    max_time_chunk: float = 30.0

    # Optional dtype name given as a string; None leaves it unspecified.
    torch_dtype: Optional[str] = None
    # Accelerator devices listed as supported by default.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``"/"`` replaced by ``"--"``."""
        return self.repo_id.replace("/", "--")
|
|
|
|
|
|
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    """``InlineAsrOptions`` specialised for the Whisper inference framework.

    Adds the framework selector, a language setting and a word-level
    timestamp toggle, and overrides the default ``supported_devices`` list.
    """

    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    # Language setting; defaults to "en".
    language: str = "en"
    # Overrides the inherited default: only CPU and CUDA are listed (no MPS).
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True
|