
import logging
from pathlib import Path
from typing import Optional

from docling_core.types.doc import DoclingDocument, DocumentOrigin
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrNativeWhisperOptions,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class _ConversationWord(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, ge=0, description="Start time in seconds from the start of the audio"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from the start of the audio"
    )


class _ConversationItem(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, ge=0, description="Start time in seconds from the start of the audio"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from the start of the audio"
    )
    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    words: Optional[list[_ConversationWord]] = Field(
        None, description="Individual words with timestamps"
    )

    def __lt__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time < other.start_time

    def __eq__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time == other.start_time

    def to_string(self) -> str:
        """Format the conversation entry as a single line of text."""
        result = ""
        if (self.start_time is not None) and (self.end_time is not None):
            result += f"[time: {self.start_time}-{self.end_time}] "

        if self.speaker is not None:
            result += f"[speaker:{self.speaker}] "

        result += self.text
        return result
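
# Illustrative example of the line produced by _ConversationItem.to_string()
# (the values below are invented for demonstration):
#   [time: 0.0-2.48] [speaker:speaker-0] Hello and welcome to the recording.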


class _NativeWhisperModel:
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrNativeWhisperOptions,
    ):
        """Transcriber using the native openai-whisper implementation."""
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import whisper  # type: ignore
            except ImportError:
                raise ImportError(
                    "whisper is not installed. Please install it via "
                    "`pip install openai-whisper` or `uv sync --extra asr`."
                )
            self.asr_options = asr_options
            self.max_tokens = asr_options.max_new_tokens
            self.temperature = asr_options.temperature

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _NativeWhisperModel({self.model_name})")
            if artifacts_path is not None:
                _log.info(f"loading {self.model_name} from {artifacts_path}")
                self.model = whisper.load_model(
                    name=self.model_name,
                    device=self.device,
                    download_root=str(artifacts_path),
                )
            else:
                self.model = whisper.load_model(
                    name=self.model_name, device=self.device
                )

            self.verbose = asr_options.verbose
            self.timestamps = asr_options.timestamps
            self.word_timestamps = asr_options.word_timestamps

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        audio_path: Path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                # NOTE: the mimetype is currently fixed to WAV, even when the
                # input uses another audio container.
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            _log.error(f"Audio transcription failed: {exc}")

            conv_res.status = ConversionStatus.FAILURE
            return conv_res

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        result = self.model.transcribe(
            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
        )

        convo: list[_ConversationItem] = []
        for segment in result["segments"]:
            item = _ConversationItem(
                start_time=segment["start"],
                end_time=segment["end"],
                text=segment["text"],
                words=[],
            )
            if "words" in segment and self.word_timestamps:
                item.words = []
                for word in segment["words"]:
                    item.words.append(
                        _ConversationWord(
                            start_time=word["start"],
                            end_time=word["end"],
                            text=word["word"],
                        )
                    )
            convo.append(item)

        return convo
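
# For reference: openai-whisper's transcribe() returns a dict whose "segments"
# list carries "start", "end" and "text" per segment, plus a per-word "words"
# list (with "start"/"end"/"word" keys) when word_timestamps=True. transcribe()
# above maps these onto _ConversationItem and _ConversationWord.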


class AsrPipeline(BasePipeline):
    def __init__(self, pipeline_options: AsrPipelineOptions):
        super().__init__(pipeline_options)
        self.keep_backend = True

        self.pipeline_options: AsrPipelineOptions = pipeline_options

        # Resolve the model artifacts folder: an explicit pipeline option takes
        # precedence over the global settings.
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
            asr_options: InlineAsrNativeWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _NativeWhisperModel(
                enabled=True,  # must always be enabled for this pipeline to make sense
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=asr_options,
            )
        else:
            # Fail fast: without a supported model, _build_document() would later
            # crash with an opaque AttributeError on self._model.
            raise RuntimeError(
                f"No model support for ASR options: {self.pipeline_options.asr_options}"
            )

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        # Propagate the status set during the model run rather than always
        # reporting success, so transcription failures are not masked.
        return conv_res.status

    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        return AsrPipelineOptions()

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            self._model.run(conv_res=conv_res)

        return conv_res

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, NoOpBackend)