Docling/docling/cli/main.py

import importlib
import json
import logging
import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional

import typer
from docling_core.utils.file import resolve_file_source

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
    InputFormat,
    OutputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

_log = logging.getLogger(__name__)
from rich.console import Console

err_console = Console(stderr=True)


app = typer.Typer(
    name="Docling",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)


def version_callback(value: bool):
    if value:
        docling_version = importlib.metadata.version("docling")
        docling_core_version = importlib.metadata.version("docling-core")
        docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
        docling_parse_version = importlib.metadata.version("docling-parse")
        print(f"Docling version: {docling_version}")
        print(f"Docling Core version: {docling_core_version}")
        print(f"Docling IBM Models version: {docling_ibm_models_version}")
        print(f"Docling Parse version: {docling_parse_version}")
        raise typer.Exit()


# Define an enum for the backend options
class Backend(str, Enum):
    PYPDFIUM2 = "pypdfium2"
    DOCLING = "docling"


# Define an enum for the ocr engines
class OcrEngine(str, Enum):
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            if export_json:
                fname = output_dir / f"{doc_filename}.json"
                with fname.open("w") as fp:
                    _log.info(f"writing JSON output to {fname}")
                    fp.write(json.dumps(conv_res.document.export_to_dict()))

            # Export Text format:
            if export_txt:
                fname = output_dir / f"{doc_filename}.txt"
                with fname.open("w") as fp:
                    _log.info(f"writing Text output to {fname}")
                    fp.write(conv_res.document.export_to_markdown(strict_text=True))

            # Export Markdown format:
            if export_md:
                fname = output_dir / f"{doc_filename}.md"
                with fname.open("w") as fp:
                    _log.info(f"writing Markdown output to {fname}")
                    fp.write(conv_res.document.export_to_markdown())

            # Export Document Tags format:
            if export_doctags:
                fname = output_dir / f"{doc_filename}.doctags"
                with fname.open("w") as fp:
                    _log.info(f"writing Doc Tags output to {fname}")
                    fp.write(conv_res.document.export_to_document_tokens())

        else:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )


@app.command(no_args_is_help=True)
def convert(
    input_sources: Annotated[
        List[str],
        typer.Argument(
            ...,
            metavar="source",
            help="PDF files to convert. Can be local file / directory paths or URL.",
        ),
    ],
    from_formats: List[InputFormat] = typer.Option(
        None,
        "--from",
        help="Specify input formats to convert from. Defaults to all formats.",
    ),
    to_formats: List[OutputFormat] = typer.Option(
        None, "--to", help="Specify output formats. Defaults to Markdown."
    ),
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
    abort_on_error: Annotated[
        bool,
        typer.Option(
            ...,
            "--abort-on-error/--no-abort-on-error",
            help="If enabled, the bitmap content will be processed using OCR.",
        ),
    ] = False,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    logging.basicConfig(level=logging.INFO)

    if from_formats is None:
        from_formats = [e for e in InputFormat]

    input_doc_paths: List[Path] = []
    for src in input_sources:
        source = resolve_file_source(source=src)
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
            )
            raise typer.Abort()
        elif source.is_dir():
            for fmt in from_formats:
                for ext in FormatToExtensions[fmt]:
                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
        else:
            input_doc_paths.append(source)

    if to_formats is None:
        to_formats = [OutputFormat.MARKDOWN]

    export_json = OutputFormat.JSON in to_formats
    export_md = OutputFormat.MARKDOWN in to_formats
    export_txt = OutputFormat.TEXT in to_formats
    export_doctags = OutputFormat.DOCTAGS in to_formats

    match ocr_engine:
        case OcrEngine.EASYOCR:
            ocr_options: OcrOptions = EasyOcrOptions()
        case OcrEngine.TESSERACT_CLI:
            ocr_options = TesseractCliOcrOptions()
        case OcrEngine.TESSERACT:
            ocr_options = TesseractOcrOptions()
        case _:
            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        ocr_options=ocr_options,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching

    format_options: Dict[InputFormat, FormatOption] = {
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=DoclingParseDocumentBackend,  # pdf_backend
        )
    }
    doc_converter = DocumentConverter(
        allowed_formats=from_formats,
        format_options=format_options,
    )

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths, raises_on_error=abort_on_error
    )

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
        conv_results,
        output_dir=output,
        export_json=export_json,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")


if __name__ == "__main__":
    app()