feat: Support audio input (#1763)

* scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* doing scaffolding for audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* WIP: got first transcription working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* all working, time to start cleaning up

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* first working ASR pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added openai-whisper as a first transcription model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updating with asr_options

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* finalised the first working ASR pipeline with Whisper

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* use whisper from the latest git commit

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* updated comment

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* file rename

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename to NoOpBackend, add test for ASR pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Support every format in NoOpBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add missing audio file and test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Install ffmpeg system dependency for ASR test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-06-23 14:47:26 +02:00
committed by GitHub
parent d26dac61a8
commit 1557e7ce3e
14 changed files with 941 additions and 62 deletions
+92
View File
@@ -0,0 +1,92 @@
import logging
from enum import Enum
from pydantic import (
AnyUrl,
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import (
# AsrResponseFormat,
# ApiAsrOptions,
InferenceAsrFramework,
InlineAsrNativeWhisperOptions,
TransformersModelType,
)
_log = logging.getLogger(__name__)
WHISPER_TINY = InlineAsrNativeWhisperOptions(
repo_id="tiny",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
repo_id="small",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
repo_id="medium",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_BASE = InlineAsrNativeWhisperOptions(
repo_id="base",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
repo_id="large",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
repo_id="turbo",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
class AsrModelType(str, Enum):
WHISPER_TINY = "whisper_tiny"
WHISPER_SMALL = "whisper_small"
WHISPER_MEDIUM = "whisper_medium"
WHISPER_BASE = "whisper_base"
WHISPER_LARGE = "whisper_large"
WHISPER_TURBO = "whisper_turbo"
+3
View File
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
+3 -1
View File
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
_log.error(
f"Input document {obj.name} does not match any allowed format."
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
)
backend = _DummyBackend
else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
_log.info(f"detected formats: {formats}")
if formats:
if len(formats) == 1 and mime not in ("text/plain"):
return formats[0]
+12 -1
View File
@@ -11,8 +11,13 @@ from pydantic import (
)
from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
)
class PdfPipeline(str, Enum):
class ProcessingPipeline(str, Enum):
STANDARD = "standard"
VLM = "vlm"
ASR = "asr"
@@ -0,0 +1,57 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
TransformersModelType,
)
class BaseAsrOptions(BaseModel):
kind: str
# prompt: str
class InferenceAsrFramework(str, Enum):
# MLX = "mlx" # disabled for now
# TRANSFORMERS = "transformers" # disabled for now
WHISPER = "whisper"
class InlineAsrOptions(BaseAsrOptions):
kind: Literal["inline_model_options"] = "inline_model_options"
repo_id: str
verbose: bool = False
timestamps: bool = True
temperature: float = 0.0
max_new_tokens: int = 256
max_time_chunk: float = 30.0
torch_dtype: Optional[str] = None
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
language: str = "en"
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
]
word_timestamps: bool = True