
* scaffolding in place Signed-off-by: Peter Staar <taa@zurich.ibm.com> * doing scaffolding for audio pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * WIP: got first transcription working Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, time to start cleaning up Signed-off-by: Peter Staar <taa@zurich.ibm.com> * first working ASR pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added openai-whisper as a first transcription model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updating with asr_options Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalised the first working ASR pipeline with Whisper Signed-off-by: Peter Staar <taa@zurich.ibm.com> * use whisper from the latest git commit Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * updated comment Signed-off-by: Peter Staar <taa@zurich.ibm.com> * AudioBackend -> DummyBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * file rename Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename to NoOpBackend, add test for ASR pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Support every format in NoOpBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing audio file and test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Install ffmpeg system dependency for ASR test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Peter W. J. 
Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
93 lines
2.1 KiB
Python
93 lines
2.1 KiB
Python
import logging
|
|
from enum import Enum
|
|
|
|
from pydantic import (
|
|
AnyUrl,
|
|
)
|
|
|
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
from docling.datamodel.pipeline_options_asr_model import (
|
|
# AsrResponseFormat,
|
|
# ApiAsrOptions,
|
|
InferenceAsrFramework,
|
|
InlineAsrNativeWhisperOptions,
|
|
TransformersModelType,
|
|
)
|
|
|
|
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__)
|
|
|
|
# Preset options for the native Whisper framework using the "tiny" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_TINY = InlineAsrNativeWhisperOptions(
    repo_id="tiny",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
# Preset options for the native Whisper framework using the "small" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
    repo_id="small",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
# Preset options for the native Whisper framework using the "medium" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
    repo_id="medium",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
# Preset options for the native Whisper framework using the "base" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_BASE = InlineAsrNativeWhisperOptions(
    repo_id="base",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
# Preset options for the native Whisper framework using the "large" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
    repo_id="large",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
# Preset options for the native Whisper framework using the "turbo" model id.
# NOTE(review): the sampling keyword was spelled `temperatue`; corrected to
# `temperature` on the assumption that this is the field declared by
# InlineAsrNativeWhisperOptions -- confirm against the options model.
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
    repo_id="turbo",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)
|
|
|
|
|
|
class AsrModelType(str, Enum):
    """String-valued identifiers for the available Whisper model sizes.

    Because the enum also derives from ``str``, each member compares equal
    to its plain string value and can be looked up from it, e.g.
    ``AsrModelType("whisper_tiny")``.

    The member names mirror the module-level ``WHISPER_*`` presets; the
    mapping from an identifier to its preset is not defined in this class.
    """

    # Declaration order is preserved when iterating over the enum.
    WHISPER_TINY = "whisper_tiny"
    WHISPER_SMALL = "whisper_small"
    WHISPER_MEDIUM = "whisper_medium"
    WHISPER_BASE = "whisper_base"
    WHISPER_LARGE = "whisper_large"
    WHISPER_TURBO = "whisper_turbo"
|