feat: pdf backend, table mode as options and artifacts path (#203)
* feat: add more options in the CLI
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* update CLI docs
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* expose artifacts-path as argument
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
af323c04ef
commit
40ad987303
@ -5,12 +5,15 @@ import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
@ -58,9 +62,10 @@ def version_callback(value: bool):
|
||||
|
||||
|
||||
# Define an enum for the backend options
|
||||
class Backend(str, Enum):
|
||||
class PdfBackend(str, Enum):
|
||||
PYPDFIUM2 = "pypdfium2"
|
||||
DOCLING = "docling"
|
||||
DLPARSE_V1 = "dlparse_v1"
|
||||
DLPARSE_V2 = "dlparse_v2"
|
||||
|
||||
|
||||
# Define an enum for the ocr engines
|
||||
@ -151,6 +156,17 @@ def convert(
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = PdfBackend.DLPARSE_V1,
|
||||
table_mode: Annotated[
|
||||
TableFormerMode,
|
||||
typer.Option(..., help="The mode to use in the table structure model."),
|
||||
] = TableFormerMode.FAST,
|
||||
artifacts_path: Annotated[
|
||||
Optional[Path],
|
||||
typer.Option(..., help="If provided, the location of the model artifacts."),
|
||||
] = None,
|
||||
abort_on_error: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
@ -217,11 +233,25 @@ def convert(
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
match pdf_backend:
|
||||
case PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
case PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
case PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||
backend=backend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
|
@ -1,4 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Union
|
||||
|
||||
@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
FAST = auto()
|
||||
ACCURATE = auto()
|
||||
FAST = "fast"
|
||||
ACCURATE = "accurate"
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
|
@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru
|
||||
```console
|
||||
$ docling --help
|
||||
|
||||
Usage: docling [OPTIONS] source
|
||||
|
||||
Usage: docling [OPTIONS] source
|
||||
|
||||
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
|
||||
│ [required] │
|
||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │
|
||||
│ Defaults to all formats. │
|
||||
│ [default: None] │
|
||||
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
||||
│ Markdown. │
|
||||
│ [default: None] │
|
||||
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: ocr] │
|
||||
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │
|
||||
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: no-abort-on-error] │
|
||||
│ --output PATH Output directory where results are saved. │
|
||||
│ [default: .] │
|
||||
│ --version Show version information. │
|
||||
│ --help Show this message and exit. │
|
||||
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
|
||||
│ Defaults to all formats. │
|
||||
│ [default: None] │
|
||||
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
||||
│ Markdown. │
|
||||
│ [default: None] │
|
||||
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
||||
│ processed using OCR. │
|
||||
│ [default: ocr] │
|
||||
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
|
||||
│ [default: easyocr] │
|
||||
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
|
||||
│ [default: dlparse_v1] │
|
||||
│ --table-mode [fast|accurate] The mode to use in the table structure │
|
||||
│ model. │
|
||||
│ [default: fast] │
|
||||
│ --abort-on-error --no-abort-on-error If enabled, processing is aborted when │
|
||||
│ the first error is encountered. │
|
||||
│ [default: no-abort-on-error] │
|
||||
│ --output PATH Output directory where results are │
|
||||
│ saved. │
|
||||
│ [default: .] │
|
||||
│ --version Show version information. │
|
||||
│ --help Show this message and exit. │
|
||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
```
|
||||
</details>
|
||||
|
Loading…
Reference in New Issue
Block a user