feat: Support audio input (#1763)

* scaffolding in place Signed-off-by: Peter Staar <taa@zurich.ibm.com> * doing scaffolding for audio pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * WIP: got first transcription working Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, time to start cleaning up Signed-off-by: Peter Staar <taa@zurich.ibm.com> * first working ASR pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added openai-whisper as a first transcription model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updating with asr_options Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalised the first working ASR pipeline with Whisper Signed-off-by: Peter Staar <taa@zurich.ibm.com> * use whisper from the latest git commit Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * updated comment Signed-off-by: Peter Staar <taa@zurich.ibm.com> * AudioBackend -> DummyBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * file rename Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename to NoOpBackend, add test for ASR pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Support every format in NoOpBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing audio file and test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Install ffmpeg system dependency for ASR test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Peter W. J. 
Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-06-23 14:47:26 +02:00
parent d26dac61a8
commit 1557e7ce3e
14 changed files with 941 additions and 62 deletions
@@ -0,0 +1,92 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # AsrResponseFormat,
+    # ApiAsrOptions,
+    InferenceAsrFramework,
+    InlineAsrNativeWhisperOptions,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+WHISPER_TINY = InlineAsrNativeWhisperOptions(
+    repo_id="tiny",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+    repo_id="small",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+    repo_id="medium",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_BASE = InlineAsrNativeWhisperOptions(
+    repo_id="base",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+    repo_id="large",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+    repo_id="turbo",
+    inference_framework=InferenceAsrFramework.WHISPER,
+    verbose=True,
+    timestamps=True,
+    word_timestamps=True,
+    temperatue=0.0,
+    max_new_tokens=256,
+    max_time_chunk=30.0,
+)
+
+
+class AsrModelType(str, Enum):  # string-valued names for the ASR presets
+    WHISPER_TINY = "whisper_tiny"  # member names mirror the module-level WHISPER_* constants;
+    WHISPER_SMALL = "whisper_small"  # the enum-to-preset lookup is not defined in this module
+    WHISPER_MEDIUM = "whisper_medium"
+    WHISPER_BASE = "whisper_base"
+    WHISPER_LARGE = "whisper_large"
+    WHISPER_TURBO = "whisper_turbo"
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"
+    AUDIO = "audio"


 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.XLSX: ["xlsx", "xlsm"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
                _log.error(
-                    f"Input document {obj.name} does not match any allowed format."
+                    f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
                )
                backend = _DummyBackend
            else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
        mime = mime or _DocumentConversionInput._detect_csv(content)
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
+
        if formats:
            if len(formats) == 1 and mime not in ("text/plain"):
                return formats[0]
@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated

+from docling.datamodel import asr_model_specs
+
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
    )


+class AsrPipelineOptions(PipelineOptions):  # options for the ASR pipeline
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY  # defaults to the WHISPER_TINY preset
+    artifacts_path: Optional[Union[Path, str]] = None  # NOTE(review): presumably a local model-artifact directory; confirm
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""

@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    )


-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):  # pipeline selector; this change renames it from PdfPipeline
    STANDARD = "standard"
    VLM = "vlm"
+    ASR = "asr"  # new member for the ASR pipeline
@@ -0,0 +1,57 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+    # InferenceFramework,
+    TransformersModelType,
+)
+
+
+class BaseAsrOptions(BaseModel):  # common base model for ASR option sets
+    kind: str  # required tag; InlineAsrOptions pins it to a Literal value
+    # prompt: str
+
+
+class InferenceAsrFramework(str, Enum):  # ASR inference frameworks; WHISPER is the only enabled member
+    # MLX = "mlx" # disabled for now
+    # TRANSFORMERS = "transformers" # disabled for now
+    WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):  # options for an inline ASR model identified by repo_id
+    kind: Literal["inline_model_options"] = "inline_model_options"  # fixed tag value for this option family
+
+    repo_id: str  # required model identifier; the only field without a default
+
+    verbose: bool = False
+    timestamps: bool = True
+
+    temperature: float = 0.0
+    max_new_tokens: int = 256
+    max_time_chunk: float = 30.0  # NOTE(review): presumably seconds of audio per chunk; confirm
+
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [  # NOTE(review): presumably the devices the model may run on
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def repo_cache_folder(self) -> str:  # repo_id with every "/" replaced by "--"
+        return self.repo_id.replace("/", "--")
+
+
+class InlineAsrNativeWhisperOptions(InlineAsrOptions):  # inline options defaulting to the WHISPER framework
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
+
+    language: str = "en"  # default language code
+    supported_devices: List[AcceleratorDevice] = [  # overrides the parent default: CPU and CUDA only, no MPS
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+    ]
+    word_timestamps: bool = True