feat: Support audio input (#1763)
* scaffolding in place Signed-off-by: Peter Staar <taa@zurich.ibm.com> * doing scaffolding for audio pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * WIP: got first transcription working Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, time to start cleaning up Signed-off-by: Peter Staar <taa@zurich.ibm.com> * first working ASR pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added openai-whisper as a first transcription model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updating with asr_options Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalised the first working ASR pipeline with Whisper Signed-off-by: Peter Staar <taa@zurich.ibm.com> * use whisper from the latest git commit Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * updated comment Signed-off-by: Peter Staar <taa@zurich.ibm.com> * AudioBackend -> DummyBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * file rename Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename to NoOpBackend, add test for ASR pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Support every format in NoOpBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing audio file and test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Install ffmpeg system dependency for ASR test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
d26dac61a8
commit
1557e7ce3e
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.asr_model_specs import (
|
||||
WHISPER_BASE,
|
||||
WHISPER_LARGE,
|
||||
WHISPER_MEDIUM,
|
||||
WHISPER_SMALL,
|
||||
WHISPER_TINY,
|
||||
WHISPER_TURBO,
|
||||
AsrModelType,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AsrPipelineOptions,
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PaginatedPipelineOptions,
|
||||
PdfBackend,
|
||||
PdfPipeline,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
ProcessingPipeline,
|
||||
TableFormerMode,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
|
||||
SMOLDOCLING_TRANSFORMERS,
|
||||
VlmModelType,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.document_converter import (
|
||||
AudioFormatOption,
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
)
|
||||
from docling.models.factories import get_ocr_factory
|
||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
@@ -296,13 +313,17 @@ def convert( # noqa: C901
|
||||
),
|
||||
] = ImageRefMode.EMBEDDED,
|
||||
pipeline: Annotated[
|
||||
PdfPipeline,
|
||||
ProcessingPipeline,
|
||||
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
||||
] = PdfPipeline.STANDARD,
|
||||
] = ProcessingPipeline.STANDARD,
|
||||
vlm_model: Annotated[
|
||||
VlmModelType,
|
||||
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
||||
] = VlmModelType.SMOLDOCLING,
|
||||
asr_model: Annotated[
|
||||
AsrModelType,
|
||||
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
|
||||
] = AsrModelType.WHISPER_TINY,
|
||||
ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
@@ -450,12 +471,14 @@ def convert( # noqa: C901
|
||||
),
|
||||
] = None,
|
||||
):
|
||||
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
|
||||
|
||||
if verbose == 0:
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
logging.basicConfig(level=logging.WARNING, format=log_format)
|
||||
elif verbose == 1:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logging.basicConfig(level=logging.INFO, format=log_format)
|
||||
else:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logging.basicConfig(level=logging.DEBUG, format=log_format)
|
||||
|
||||
settings.debug.visualize_cells = debug_visualize_cells
|
||||
settings.debug.visualize_layout = debug_visualize_layout
|
||||
@@ -530,9 +553,12 @@ def convert( # noqa: C901
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||
pipeline_options: PaginatedPipelineOptions
|
||||
# pipeline_options: PaginatedPipelineOptions
|
||||
pipeline_options: PipelineOptions
|
||||
|
||||
if pipeline == PdfPipeline.STANDARD:
|
||||
format_options: Dict[InputFormat, FormatOption] = {}
|
||||
|
||||
if pipeline == ProcessingPipeline.STANDARD:
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
allow_external_plugins=allow_external_plugins,
|
||||
enable_remote_services=enable_remote_services,
|
||||
@@ -574,7 +600,13 @@ def convert( # noqa: C901
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
)
|
||||
elif pipeline == PdfPipeline.VLM:
|
||||
|
||||
format_options = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
elif pipeline == ProcessingPipeline.VLM:
|
||||
pipeline_options = VlmPipelineOptions(
|
||||
enable_remote_services=enable_remote_services,
|
||||
)
|
||||
@@ -600,13 +632,48 @@ def convert( # noqa: C901
|
||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||
)
|
||||
|
||||
format_options = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
elif pipeline == ProcessingPipeline.ASR:
|
||||
pipeline_options = AsrPipelineOptions(
|
||||
# enable_remote_services=enable_remote_services,
|
||||
# artifacts_path = artifacts_path
|
||||
)
|
||||
|
||||
if asr_model == AsrModelType.WHISPER_TINY:
|
||||
pipeline_options.asr_options = WHISPER_TINY
|
||||
elif asr_model == AsrModelType.WHISPER_SMALL:
|
||||
pipeline_options.asr_options = WHISPER_SMALL
|
||||
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
||||
pipeline_options.asr_options = WHISPER_MEDIUM
|
||||
elif asr_model == AsrModelType.WHISPER_BASE:
|
||||
pipeline_options.asr_options = WHISPER_BASE
|
||||
elif asr_model == AsrModelType.WHISPER_LARGE:
|
||||
pipeline_options.asr_options = WHISPER_LARGE
|
||||
elif asr_model == AsrModelType.WHISPER_TURBO:
|
||||
pipeline_options.asr_options = WHISPER_TURBO
|
||||
else:
|
||||
_log.error(f"{asr_model} is not known")
|
||||
raise ValueError(f"{asr_model} is not known")
|
||||
|
||||
_log.info(f"pipeline_options: {pipeline_options}")
|
||||
|
||||
audio_format_option = AudioFormatOption(
|
||||
pipeline_cls=AsrPipeline,
|
||||
pipeline_options=pipeline_options,
|
||||
)
|
||||
|
||||
format_options = {
|
||||
InputFormat.AUDIO: audio_format_option,
|
||||
}
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
# audio_pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
@@ -614,6 +681,7 @@ def convert( # noqa: C901
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
_log.info(f"paths: {input_doc_paths}")
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user