
Signed-off-by: Abhishek Kumar <abhishekrocketeer@gmail.com> Testing: (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=10 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf WARNING:docling.pipeline.base_pipeline:Document processing time (24.555 seconds) exceeded the specified timeout of 10.000 seconds INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 36.29 sec. WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpl6p08u5i/2206.01062v1.pdf failed to convert. INFO:docling.cli.main:Processed 1 docs, of which 1 failed INFO:docling.cli.main:All documents were converted in 36.29 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 58.36 sec. INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 58.56 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 59.82 sec. INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 59.88 seconds. 
(.venv) mario@Abhisheks-MacBook-Air docling % docling Usage: docling [OPTIONS] source ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ --from [docx|pptx|html|image|pdf|asciido Specify input formats to convert │ │ c|md|xlsx] from. Defaults to all formats. │ │ [default: None] │ │ --to [md|json|html|text|doctags] Specify output formats. Defaults to │ │ Markdown. │ │ [default: None] │ │ --image-export-mode [placeholder|embedded|referenced] Image export mode for the document │ │ (only in case of JSON, Markdown or │ │ HTML). With `placeholder`, only the │ │ position of the image is marked in │ │ the output. In `embedded` mode, the │ │ image is embedded as base64 encoded │ │ string. In `referenced` mode, the │ │ image is exported in PNG format and │ │ referenced from the main exported │ │ document. │ │ [default: embedded] │ │ --ocr --no-ocr If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: ocr] │ │ --force-ocr --no-force-ocr Replace any existing text with OCR │ │ generated text over the full │ │ content. │ │ [default: no-force-ocr] │ │ --ocr-engine [easyocr|tesseract_cli|tesseract| The OCR engine to use. │ │ ocrmac|rapidocr] [default: easyocr] │ │ --ocr-lang TEXT Provide a comma-separated list of │ │ languages used by the OCR engine. │ │ Note that each OCR engine has │ │ different values for the language │ │ names. │ │ [default: None] │ │ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. 
│ │ [default: dlparse_v2] │ │ --table-mode [fast|accurate] The mode to use in the table │ │ structure model. │ │ [default: fast] │ │ --artifacts-path PATH If provided, the location of the │ │ model artifacts. │ │ [default: None] │ │ --abort-on-error --no-abort-on-error If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: no-abort-on-error] │ │ --output PATH Output directory where results are │ │ saved. │ │ [default: .] │ │ --verbose -v INTEGER Set the verbosity level. -v for │ │ info logging, -vv for debug │ │ logging. │ │ [default: 0] │ │ --debug-visualize-cells --no-debug-visualize-cells Enable debug output which │ │ visualizes the PDF cells │ │ [default: no-debug-visualize-cells] │ │ --debug-visualize-ocr --no-debug-visualize-ocr Enable debug output which │ │ visualizes the OCR cells │ │ [default: no-debug-visualize-ocr] │ │ --debug-visualize-layout --no-debug-visualize-layout Enable debug output which │ │ visualizes the layour clusters │ │ [default: │ │ no-debug-visualize-layout] │ │ --debug-visualize-tables --no-debug-visualize-tables Enable debug output which │ │ visualizes the table cells │ │ [default: │ │ no-debug-visualize-tables] │ │ --version Show version information. │ │ --document-timeout FLOAT The timeout for processing each │ │ document, in seconds. │ │ [default: None] │ │ --help Show this message and exit. │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
185 lines
5.4 KiB
Python
185 lines
5.4 KiB
Python
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import List, Literal, Optional, Union
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
class TableFormerMode(str, Enum):
    """Modes for the TableFormer model.

    Members are also ``str`` instances, so they compare equal to their plain
    string values (e.g. ``TableFormerMode.FAST == "fast"``).
    """

    FAST = "fast"
    ACCURATE = "accurate"
|
|
|
|
|
|
class TableStructureOptions(BaseModel):
    """Options for the table structure."""

    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let the table structure model define the text cells and ignore
    #        PDF cells.
    do_cell_matching: bool = True

    # TableFormer mode to use; defaults to the FAST mode.
    mode: TableFormerMode = TableFormerMode.FAST
|
|
|
|
|
|
class OcrOptions(BaseModel):
    """OCR options.

    Base model for the engine-specific option classes in this module; each
    subclass pins ``kind`` to a ``Literal`` and supplies a default ``lang``.
    """

    kind: str  # engine identifier (no default here; subclasses set it)
    lang: List[str]  # languages passed to the OCR engine (no default here)
    force_full_page_ocr: bool = False  # If enabled, a full-page OCR is always applied
    # Percentage of the area for a bitmap to be processed with OCR.
    bitmap_area_threshold: float = 0.05
|
|
|
|
|
|
class RapidOcrOptions(OcrOptions):
    """Options for the RapidOCR engine."""

    kind: Literal["rapidocr"] = "rapidocr"

    # English and Chinese are the most commonly used models and have been
    # tested with RapidOCR. However, language as a parameter is not supported
    # by rapidocr yet, and hence changing this option doesn't affect anything.
    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    lang: List[str] = [
        "english",
        "chinese",
    ]

    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    text_score: float = 0.5  # same default as rapidocr

    use_det: Optional[bool] = None  # same default as rapidocr
    use_cls: Optional[bool] = None  # same default as rapidocr
    use_rec: Optional[bool] = None  # same default as rapidocr

    # Device selection is currently disabled (kept here commented out):
    # class Device(Enum):
    #     CPU = "CPU"
    #     CUDA = "CUDA"
    #     DIRECTML = "DIRECTML"
    #     AUTO = "AUTO"

    # device: Device = Device.AUTO  # Default value is AUTO

    print_verbose: bool = False  # same default as rapidocr

    det_model_path: Optional[str] = None  # same default as rapidocr
    cls_model_path: Optional[str] = None  # same default as rapidocr
    rec_model_path: Optional[str] = None  # same default as rapidocr

    # Reject unknown keys instead of silently ignoring them.
    model_config = ConfigDict(
        extra="forbid",
    )
|
|
|
|
|
|
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""

    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader

    # extra="forbid": reject unknown keys.
    # protected_namespaces=(): presumably cleared because the field
    # `model_storage_directory` starts with pydantic's reserved `model_`
    # prefix -- confirm against the pydantic ConfigDict documentation.
    model_config = ConfigDict(
        extra="forbid",
        protected_namespaces=(),
    )
|
|
|
|
|
|
class TesseractCliOcrOptions(OcrOptions):
    """Options for the TesseractCli engine."""

    # NOTE(review): the discriminator value here is "tesseract", whereas
    # OcrEngine.TESSERACT_CLI has the value "tesseract_cli". Left unchanged
    # because serialized options may depend on it -- confirm this is intended.
    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"  # command name used for the tesseract executable
    path: Optional[str] = None

    # Reject unknown keys instead of silently ignoring them.
    model_config = ConfigDict(
        extra="forbid",
    )
|
|
|
|
|
|
class TesseractOcrOptions(OcrOptions):
    """Options for the Tesseract engine."""

    # NOTE(review): the discriminator value here is "tesserocr", whereas
    # OcrEngine.TESSERACT has the value "tesseract". Left unchanged because
    # serialized options may depend on it -- confirm this is intended.
    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None

    # Reject unknown keys instead of silently ignoring them.
    model_config = ConfigDict(
        extra="forbid",
    )
|
|
|
|
|
|
class OcrMacOptions(OcrOptions):
    """Options for the Mac OCR engine."""

    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    recognition: str = "accurate"
    framework: str = "vision"

    # Reject unknown keys instead of silently ignoring them.
    model_config = ConfigDict(
        extra="forbid",
    )
|
|
|
|
|
|
# Define an enum for the backend options
|
|
class PdfBackend(str, Enum):
    """Enum of valid PDF backends.

    Members are also ``str`` instances, so they compare equal to their plain
    string values.
    """

    PYPDFIUM2 = "pypdfium2"
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
|
|
|
|
|
|
# Define an enum for the ocr engines
|
|
class OcrEngine(str, Enum):
    """Enum of valid OCR engines.

    Members are also ``str`` instances, so they compare equal to their plain
    string values.
    """

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
|
|
|
|
|
|
class PipelineOptions(BaseModel):
    """Base pipeline options."""

    # This default will be set to False on a future version of docling.
    create_legacy_output: bool = True

    # Timeout for processing each document, in seconds.
    # None is the default; presumably it means no limit is enforced --
    # confirm against the pipeline code that reads this value.
    document_timeout: Optional[float] = None
|
|
|
|
|
|
class PdfPipelineOptions(PipelineOptions):
    """Options for the PDF pipeline."""

    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    # Tagged union: the `kind` field of the supplied options selects which
    # engine-specific options class is used. Defaults to EasyOCR.
    ocr_options: Union[
        EasyOcrOptions,
        TesseractCliOcrOptions,
        TesseractOcrOptions,
        OcrMacOptions,
        RapidOcrOptions,
    ] = Field(EasyOcrOptions(), discriminator="kind")

    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False

    # Deprecated; the message below states the replacement workflow.
    generate_table_images: bool = Field(
        default=False,
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )
|