diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index e2eb19e..ddc06f2 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -60,7 +60,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then echo "Skipping $file" continue fi diff --git a/README.md b/README.md index 309e103..c53e7b7 100644 --- a/README.md +++ b/README.md @@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad ## Features -* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format -* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images -* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) +* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) +* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models * 💻 Simple and convenient CLI ### Coming soon diff --git a/docs/examples/minimal_asr_pipeline.py b/docs/examples/minimal_asr_pipeline.py new file mode 100644 index 0000000..72c1276 --- /dev/null +++ b/docs/examples/minimal_asr_pipeline.py @@ -0,0 +1,56 @@ +from pathlib import Path + +from docling_core.types.doc import DoclingDocument + +from docling.datamodel import asr_model_specs +from docling.datamodel.base_models import ConversionStatus, InputFormat +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import AsrPipelineOptions +from docling.document_converter import AudioFormatOption, DocumentConverter +from docling.pipeline.asr_pipeline import AsrPipeline + + +def get_asr_converter(): + """Create a DocumentConverter configured for ASR with whisper_turbo model.""" + pipeline_options = AsrPipelineOptions() + pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO + + converter = DocumentConverter( + format_options={ + InputFormat.AUDIO: AudioFormatOption( + pipeline_cls=AsrPipeline, + pipeline_options=pipeline_options, + ) + } + ) + return converter + + +def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument: + """ASR pipeline conversion using whisper_turbo""" + # Check if the test audio file exists + assert audio_path.exists(), f"Test audio file not found: {audio_path}" + + converter = get_asr_converter() + + # Convert the audio file + result: ConversionResult = converter.convert(audio_path) + + # Verify conversion was successful + assert result.status == ConversionStatus.SUCCESS, ( + f"Conversion failed with status: {result.status}" + ) + return result.document + + +if __name__ == "__main__": + audio_path = Path("tests/data/audio/sample_10s.mp3") + + doc = asr_pipeline_conversion(audio_path=audio_path) + print(doc.export_to_markdown()) + + # Expected output: + # + # [time: 0.0-4.0] Shakespeare on Scenery by Oscar Wilde + # + # [time: 5.28-9.96] This is a LibriVox recording. All LibriVox recordings are in the public domain. diff --git a/docs/index.md b/docs/index.md index ad9ac80..7ec40bf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -20,14 +20,15 @@ Docling simplifies document processing, parsing diverse formats — including ad ## Features -* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format -* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images -* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥 +* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) +* 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models * 💻 Simple and convenient CLI ### Coming soon diff --git a/mkdocs.yml b/mkdocs.yml index 4f82c19..d1c6753 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -80,6 +80,7 @@ nav: - "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py - "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py - "VLM comparison": examples/compare_vlm_models.py + - "ASR pipeline with Whisper": examples/minimal_asr_pipeline.py - "Figure export": examples/export_figures.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py