feat: Support audio input (#1763)

* scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* doing scaffolding for audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* WIP: got first transcription working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* all working, time to start cleaning up

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* first working ASR pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added openai-whisper as a first transcription model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updating with asr_options

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* finalised the first working ASR pipeline with Whisper

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* use whisper from the latest git commit

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* updated comment

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* file rename

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename to NoOpBackend, add test for ASR pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Support every format in NoOpBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add missing audio file and test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Install ffmpeg system dependency for ASR test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Author: Peter W. J. Staar
Date: 2025-06-23 14:47:26 +02:00
Committed by: GitHub
parent d26dac61a8
commit 1557e7ce3e
14 changed files with 941 additions and 62 deletions
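For orientation before the diff: the new classes introduced here can be wired up programmatically in much the same way the CLI does below. This is a minimal sketch assembled from names that appear in this change (AudioFormatOption, AsrPipeline, AsrPipelineOptions, WHISPER_TINY, InputFormat.AUDIO); the import location of InputFormat, the input path, and the final export call are assumptions based on docling's existing API, not part of the commit.

from docling.datamodel.asr_model_specs import WHISPER_TINY
from docling.datamodel.base_models import InputFormat  # assumed location of InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

# Mirror the CLI: pick a Whisper preset as the ASR options for the pipeline.
pipeline_options = AsrPipelineOptions()
pipeline_options.asr_options = WHISPER_TINY

# Route audio inputs to the new ASR pipeline, the same way the CLI builds its
# format_options mapping in the diff below.
converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("sample.wav")  # placeholder path, not from the commit
print(result.document.export_to_markdown())

On the command line, the same selection is exposed through the new pipeline and asr_model options added to convert() in the diff below.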


@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
     ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
 
-    if pipeline == PdfPipeline.STANDARD:
+    format_options: Dict[InputFormat, FormatOption] = {}
+
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend, # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+        _log.info(f"pipeline_options: {pipeline_options}")
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
 if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
 doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
     start_time = time.time()
+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
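A note on the model dispatch above: the ASR branch maps each AsrModelType value to its Whisper preset with an if/elif chain. The same selection can be written as a lookup table; the sketch below is illustrative only, is not part of the commit, and uses only names that appear in the diff.

from docling.datamodel.asr_model_specs import (
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
)

# Table-driven equivalent of the CLI's if/elif chain (illustration only).
_ASR_PRESETS = {
    AsrModelType.WHISPER_TINY: WHISPER_TINY,
    AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
    AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
    AsrModelType.WHISPER_BASE: WHISPER_BASE,
    AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
    AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
}


def select_asr_options(asr_model: AsrModelType):
    """Return the Whisper preset for the requested model, raising like the CLI does."""
    try:
        return _ASR_PRESETS[asr_model]
    except KeyError:
        raise ValueError(f"{asr_model} is not known") from None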