feat: pdf backend, table mode as options and artifacts path (#203)

* feat: add more options in the CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update CLI docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* expose artifacts-path as argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-11-04 14:26:05 +01:00 committed by GitHub
parent af323c04ef
commit 40ad987303
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 63 additions and 26 deletions

View File

@ -5,12 +5,15 @@ import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional
from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer
from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
@ -58,9 +62,10 @@ def version_callback(value: bool):
# Define an enum for the backend options
class Backend(str, Enum):
class PdfBackend(str, Enum):
PYPDFIUM2 = "pypdfium2"
DOCLING = "docling"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"
# Define an enum for the ocr engines
@ -151,6 +156,17 @@ def convert(
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V1,
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
] = TableFormerMode.FAST,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
] = None,
abort_on_error: Annotated[
bool,
typer.Option(
@ -217,11 +233,25 @@ def convert(
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
match pdf_backend:
case PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
case PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
case PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
case _:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend
backend=backend, # pdf_backend
)
}
doc_converter = DocumentConverter(

View File

@ -1,4 +1,4 @@
from enum import Enum, auto
from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union
@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum):
FAST = auto()
ACCURATE = auto()
FAST = "fast"
ACCURATE = "accurate"
class TableStructureOptions(BaseModel):

View File

@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru
```console
$ docling --help
Usage: docling [OPTIONS] source
Usage: docling [OPTIONS] source
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
│ [required] │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │
│ Defaults to all formats. │
│ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │
│ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: no-abort-on-error] │
│ --output PATH Output directory where results are saved. │
│ [default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
│ Defaults to all formats. │
│ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │
│ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
│ [default: easyocr] │
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
│ [default: dlparse_v1] │
│ --table-mode [fast|accurate] The mode to use in the table structure │
│ model. │
│ [default: fast] │
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: no-abort-on-error] │
│ --output PATH Output directory where results are │
│ saved. │
│ [default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
</details>