Docling/docling/datamodel/pipeline_options.py
Christoph Auer ec6cf6f7e8
feat: Introduce LayoutOptions to control layout postprocessing behaviour (#1870)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-04 15:36:13 +02:00

322 lines
9.5 KiB
Python

import logging
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
from pydantic import (
AnyUrl,
BaseModel,
ConfigDict,
Field,
)
from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
)
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
VlmModelType,
)
_log = logging.getLogger(__name__)
class BaseOptions(BaseModel):
"""Base class for options."""
kind: ClassVar[str]
class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""
FAST = "fast"
ACCURATE = "accurate"
class TableStructureOptions(BaseModel):
"""Options for the table structure."""
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
# are merged across table columns.
# False: Let table structure model define the text cells, ignore PDF cells.
)
mode: TableFormerMode = TableFormerMode.ACCURATE
class OcrOptions(BaseOptions):
"""OCR options."""
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to processed with OCR
)
class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
# English and chinese are the most commly used models and have been tested with RapidOCR.
lang: List[str] = [
"english",
"chinese",
]
# However, language as a parameter is not supported by rapidocr yet
# and hence changing this options doesn't affect anything.
# For more details on supported languages by RapidOCR visit
# https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
# For more details on the following options visit
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
text_score: float = 0.5 # same default as rapidocr
use_det: Optional[bool] = None # same default as rapidocr
use_cls: Optional[bool] = None # same default as rapidocr
use_rec: Optional[bool] = None # same default as rapidocr
print_verbose: bool = False # same default as rapidocr
det_model_path: Optional[str] = None # same default as rapidocr
cls_model_path: Optional[str] = None # same default as rapidocr
rec_model_path: Optional[str] = None # same default as rapidocr
rec_keys_path: Optional[str] = None # same default as rapidocr
model_config = ConfigDict(
extra="forbid",
)
class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""
kind: ClassVar[Literal["easyocr"]] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: Optional[bool] = None
confidence_threshold: float = 0.5
model_storage_directory: Optional[str] = None
recog_network: Optional[str] = "standard"
download_enabled: bool = True
model_config = ConfigDict(
extra="forbid",
protected_namespaces=(),
)
class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""
kind: ClassVar[Literal["tesseract"]] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
path: Optional[str] = None
model_config = ConfigDict(
extra="forbid",
)
class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
model_config = ConfigDict(
extra="forbid",
)
class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
framework: str = "vision"
model_config = ConfigDict(
extra="forbid",
)
class PictureDescriptionBaseOptions(BaseOptions):
batch_size: int = 8
scale: float = 2
picture_area_threshold: float = (
0.05 # percentage of the area for a picture to processed with the models
)
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
kind: ClassVar[Literal["api"]] = "api"
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
headers: Dict[str, str] = {}
params: Dict[str, Any] = {}
timeout: float = 20
concurrency: int = 1
prompt: str = "Describe this image in a few sentences."
provenance: str = ""
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
kind: ClassVar[Literal["vlm"]] = "vlm"
repo_id: str
prompt: str = "Describe this image in a few sentences."
# Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
# SmolVLM
smolvlm_picture_description = PictureDescriptionVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
prompt="What is shown in this image?",
)
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
PYPDFIUM2 = "pypdfium2"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"
DLPARSE_V4 = "dlparse_v4"
# Define an enum for the ocr engines
@deprecated("Use ocr_factory.registered_enum")
class OcrEngine(str, Enum):
"""Enum of valid OCR engines."""
EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"
class PipelineOptions(BaseModel):
"""Base pipeline options."""
create_legacy_output: bool = (
True # This default will be set to False on a future version of docling
)
document_timeout: Optional[float] = None
accelerator_options: AcceleratorOptions = AcceleratorOptions()
enable_remote_services: bool = False
allow_external_plugins: bool = False
class PaginatedPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
class VlmPipelineOptions(PaginatedPipelineOptions):
generate_page_images: bool = True
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
smoldocling_vlm_conversion_options
)
class LayoutOptions(BaseModel):
"""Options for layout processing."""
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: OcrOptions = EasyOcrOptions()
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
layout_options: LayoutOptions = LayoutOptions()
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
generate_table_images: bool = Field(
default=False,
deprecated=(
"Field `generate_table_images` is deprecated. "
"To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
"before conversion and then use the `TableItem.get_image` function."
),
)
generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
class ProcessingPipeline(str, Enum):
STANDARD = "standard"
VLM = "vlm"
ASR = "asr"