feat: pdf backend, table mode as options and artifacts path (#203)
* feat: add more options in the CLI
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* update CLI docs
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* expose artifacts-path as argument
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
af323c04ef
commit
40ad987303
@ -5,12 +5,15 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Dict, Iterable, List, Optional
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_file_source
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
|
TableFormerMode,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
@ -58,9 +62,10 @@ def version_callback(value: bool):
|
|||||||
|
|
||||||
|
|
||||||
# Define an enum for the backend options
|
# Define an enum for the backend options
|
||||||
class Backend(str, Enum):
|
class PdfBackend(str, Enum):
|
||||||
PYPDFIUM2 = "pypdfium2"
|
PYPDFIUM2 = "pypdfium2"
|
||||||
DOCLING = "docling"
|
DLPARSE_V1 = "dlparse_v1"
|
||||||
|
DLPARSE_V2 = "dlparse_v2"
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the ocr engines
|
# Define an enum for the ocr engines
|
||||||
@ -151,6 +156,17 @@ def convert(
|
|||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
|
pdf_backend: Annotated[
|
||||||
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||||
|
] = PdfBackend.DLPARSE_V1,
|
||||||
|
table_mode: Annotated[
|
||||||
|
TableFormerMode,
|
||||||
|
typer.Option(..., help="The mode to use in the table structure model."),
|
||||||
|
] = TableFormerMode.FAST,
|
||||||
|
artifacts_path: Annotated[
|
||||||
|
Optional[Path],
|
||||||
|
typer.Option(..., help="If provided, the location of the model artifacts."),
|
||||||
|
] = None,
|
||||||
abort_on_error: Annotated[
|
abort_on_error: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -217,11 +233,25 @@ def convert(
|
|||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||||
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
|
if artifacts_path is not None:
|
||||||
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
|
match pdf_backend:
|
||||||
|
case PdfBackend.DLPARSE_V1:
|
||||||
|
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
case PdfBackend.DLPARSE_V2:
|
||||||
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
case PdfBackend.PYPDFIUM2:
|
||||||
|
backend = PyPdfiumDocumentBackend
|
||||||
|
case _:
|
||||||
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||||
|
|
||||||
format_options: Dict[InputFormat, FormatOption] = {
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
backend=backend, # pdf_backend
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Literal, Optional, Union
|
from typing import List, Literal, Optional, Union
|
||||||
|
|
||||||
@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
|||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
FAST = auto()
|
FAST = "fast"
|
||||||
ACCURATE = auto()
|
ACCURATE = "accurate"
|
||||||
|
|
||||||
|
|
||||||
class TableStructureOptions(BaseModel):
|
class TableStructureOptions(BaseModel):
|
||||||
|
@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru
|
|||||||
```console
|
```console
|
||||||
$ docling --help
|
$ docling --help
|
||||||
|
|
||||||
Usage: docling [OPTIONS] source
|
Usage: docling [OPTIONS] source
|
||||||
|
|
||||||
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||||
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
|
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
|
||||||
│ [required] │
|
│ [required] │
|
||||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||||
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
|
||||||
│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │
|
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
|
||||||
│ Defaults to all formats. │
|
│ Defaults to all formats. │
|
||||||
│ [default: None] │
|
│ [default: None] │
|
||||||
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
|
||||||
│ Markdown. │
|
│ Markdown. │
|
||||||
│ [default: None] │
|
│ [default: None] │
|
||||||
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
│ --ocr --no-ocr If enabled, the bitmap content will be │
|
||||||
│ processed using OCR. │
|
│ processed using OCR. │
|
||||||
│ [default: ocr] │
|
│ [default: ocr] │
|
||||||
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │
|
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
|
||||||
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
|
│ [default: easyocr] │
|
||||||
│ processed using OCR. │
|
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
|
||||||
│ [default: no-abort-on-error] │
|
│ [default: dlparse_v1] │
|
||||||
│ --output PATH Output directory where results are saved. │
|
│ --table-mode [fast|accurate] The mode to use in the table structure │
|
||||||
│ [default: .] │
|
│ model. │
|
||||||
│ --version Show version information. │
|
│ [default: fast] │
|
||||||
│ --help Show this message and exit. │
|
│ --abort-on-error --no-abort-on-error If enabled, processing is aborted on the │
|
||||||
|
│ first error. │
|
||||||
|
│ [default: no-abort-on-error] │
|
||||||
|
│ --output PATH Output directory where results are │
|
||||||
|
│ saved. │
|
||||||
|
│ [default: .] │
|
||||||
|
│ --version Show version information. │
|
||||||
|
│ --help Show this message and exit. │
|
||||||
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||||
```
|
```
|
||||||
</details>
|
</details>
|
||||||
|
Loading…
Reference in New Issue
Block a user