feat: pdf backend, table mode as options and artifacts path (#203)

* feat: add more options in the CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update CLI docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* expose artifacts-path as argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2024-11-04 14:26:05 +01:00 · 2024-11-04 14:26:05 +01:00 · 40ad987303
commit 40ad987303
parent af323c04ef
3 changed files with 63 additions and 26 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,12 +5,15 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional, Type

 import typer
 from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrOptions,
    PdfPipelineOptions,
+    TableFormerMode,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
@@ -58,9 +62,10 @@ def version_callback(value: bool):


 # Define an enum for the backend options
-class Backend(str, Enum):
+class PdfBackend(str, Enum):
    PYPDFIUM2 = "pypdfium2"
-    DOCLING = "docling"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"


 # Define an enum for the ocr engines
@@ -151,6 +156,17 @@ def convert(
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
+    pdf_backend: Annotated[
+        PdfBackend, typer.Option(..., help="The PDF backend to use.")
+    ] = PdfBackend.DLPARSE_V1,
+    table_mode: Annotated[
+        TableFormerMode,
+        typer.Option(..., help="The mode to use in the table structure model."),
+    ] = TableFormerMode.FAST,
+    artifacts_path: Annotated[
+        Optional[Path],
+        typer.Option(..., help="If provided, the location of the model artifacts."),
+    ] = None,
    abort_on_error: Annotated[
        bool,
        typer.Option(
@@ -217,11 +233,25 @@ def convert(
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    pipeline_options.table_structure_options.mode = table_mode
+
+    if artifacts_path is not None:
+        pipeline_options.artifacts_path = artifacts_path
+
+    match pdf_backend:
+        case PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        case PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        case PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

    format_options: Dict[InputFormat, FormatOption] = {
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
-            backend=DoclingParseDocumentBackend,  # pdf_backend
+            backend=backend,  # pdf_backend
        )
    }
    doc_converter = DocumentConverter(
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional, Union

@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field


 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"


 class TableStructureOptions(BaseModel):
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru
 ```console
 $ docling --help

- Usage: docling [OPTIONS] source
-
+ Usage: docling [OPTIONS] source                                                                                             
+                                                                                                                             
 ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
 │ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None]         │
 │                                 [required]                                                                                │
 ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ --from                                     [docx|pptx|html|image|pdf]         Specify input formats to convert from.      │
-│                                                                               Defaults to all formats.                    │
-│                                                                               [default: None]                             │
-│ --to                                       [md|json|text|doctags]             Specify output formats. Defaults to         │
-│                                                                               Markdown.                                   │
-│                                                                               [default: None]                             │
-│ --ocr               --no-ocr                                                  If enabled, the bitmap content will be      │
-│                                                                               processed using OCR.                        │
-│                                                                               [default: ocr]                              │
-│ --ocr-engine                               [easyocr|tesseract_cli|tesseract]  The OCR engine to use. [default: easyocr]   │
-│ --abort-on-error    --no-abort-on-error                                       If enabled, the bitmap content will be      │
-│                                                                               processed using OCR.                        │
-│                                                                               [default: no-abort-on-error]                │
-│ --output                                   PATH                               Output directory where results are saved.   │
-│                                                                               [default: .]                                │
-│ --version                                                                     Show version information.                   │
-│ --help                                                                        Show this message and exit.                 │
+│ --from                                     [docx|pptx|html|image|pdf|asciidoc|md]  Specify input formats to convert from. │
+│                                                                                    Defaults to all formats.               │
+│                                                                                    [default: None]                        │
+│ --to                                       [md|json|text|doctags]                  Specify output formats. Defaults to    │
+│                                                                                    Markdown.                              │
+│                                                                                    [default: None]                        │
+│ --ocr               --no-ocr                                                       If enabled, the bitmap content will be │
+│                                                                                    processed using OCR.                   │
+│                                                                                    [default: ocr]                         │
+│ --ocr-engine                               [easyocr|tesseract_cli|tesseract]       The OCR engine to use.                 │
+│                                                                                    [default: easyocr]                     │
+│ --pdf-backend                              [pypdfium2|dlparse_v1|dlparse_v2]       The PDF backend to use.                │
+│                                                                                    [default: dlparse_v1]                  │
+│ --table-mode                               [fast|accurate]                         The mode to use in the table structure │
+│                                                                                    model.                                 │
+│                                                                                    [default: fast]                        │
+│ --abort-on-error    --no-abort-on-error                                            If enabled, the bitmap content will be │
+│                                                                                    processed using OCR.                   │
+│                                                                                    [default: no-abort-on-error]           │
+│ --output                                   PATH                                    Output directory where results are     │
+│                                                                                    saved.                                 │
+│                                                                                    [default: .]                           │
+│ --version                                                                          Show version information.              │
+│ --help                                                                             Show this message and exit.            │
 ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
 </details>