feat: Support audio input (#1763)

* scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* doing scaffolding for audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* WIP: got first transcription working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* all working, time to start cleaning up

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* first working ASR pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added openai-whisper as a first transcription model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updating with asr_options

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* finalised the first working ASR pipeline with Whisper

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* use whisper from the latest git commit

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* updated comment

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* file rename

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename to NoOpBackend, add test for ASR pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Support every format in NoOpBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add missing audio file and test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Install ffmpeg system dependency for ASR test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Author: Peter W. J. Staar
Date: 2025-06-23 14:47:26 +02:00
Committed by: GitHub
parent d26dac61a8
commit 1557e7ce3e
14 changed files with 941 additions and 62 deletions
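For orientation before the diff: the new classes introduced here can be wired up programmatically in much the same way the CLI does below. This is a minimal sketch assembled from names that appear in this change (AudioFormatOption, AsrPipeline, AsrPipelineOptions, WHISPER_TINY, InputFormat.AUDIO); the import location of InputFormat, the input path, and the final export call are assumptions based on docling's existing API, not part of the commit.

from docling.datamodel.asr_model_specs import WHISPER_TINY
from docling.datamodel.base_models import InputFormat  # assumed location of InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

# Mirror the CLI: pick a Whisper preset as the ASR options for the pipeline.
pipeline_options = AsrPipelineOptions()
pipeline_options.asr_options = WHISPER_TINY

# Route audio inputs to the new ASR pipeline, the same way the CLI builds its
# format_options mapping in the diff below.
converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("sample.wav")  # placeholder path, not from the commit
print(result.document.export_to_markdown())

On the command line, the same selection is exposed through the new pipeline and asr_model options added to convert() in the diff below.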


@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
     ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
 
-    if pipeline == PdfPipeline.STANDARD:
+    format_options: Dict[InputFormat, FormatOption] = {}
+
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend, # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+        _log.info(f"pipeline_options: {pipeline_options}")
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
 if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
 doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
     start_time = time.time()
+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
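A note on the model dispatch above: the ASR branch maps each AsrModelType value to its Whisper preset with an if/elif chain. The same selection can be written as a lookup table; the sketch below is illustrative only, is not part of the commit, and uses only names that appear in the diff.

from docling.datamodel.asr_model_specs import (
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
)

# Table-driven equivalent of the CLI's if/elif chain (illustration only).
_ASR_PRESETS = {
    AsrModelType.WHISPER_TINY: WHISPER_TINY,
    AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
    AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
    AsrModelType.WHISPER_BASE: WHISPER_BASE,
    AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
    AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
}


def select_asr_options(asr_model: AsrModelType):
    """Return the Whisper preset for the requested model, raising like the CLI does."""
    try:
        return _ASR_PRESETS[asr_model]
    except KeyError:
        raise ValueError(f"{asr_model} is not known") from None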