Docling/docling/datamodel/pipeline_options.py
Nikos Livathinos c6b3763ecb
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)
- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2024-11-12 09:46:14 +01:00

85 lines
2.5 KiB
Python

from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum):
    # Modes selectable through TableStructureOptions.mode. Because the enum
    # also derives from ``str``, each member compares equal to its plain
    # string value (e.g. ``TableFormerMode.FAST == "fast"``).
    FAST = "fast"
    ACCURATE = "accurate"
class TableStructureOptions(BaseModel):
    # Options controlling table structure extraction.

    # True: matches predictions back to PDF cells. Can break table output if
    # PDF cells are merged across table columns.
    # False: lets the table structure model define the text cells and ignores
    # the PDF cells.
    do_cell_matching: bool = True

    # Which TableFormerMode to run; FAST is the default.
    mode: TableFormerMode = TableFormerMode.FAST
class OcrOptions(BaseModel):
    # Options shared by every OCR engine configuration in this module.

    # Engine identifier. It has no default here; the concrete option classes
    # in this file each pin it to a single literal value.
    kind: str

    # If enabled, a full page OCR is always applied.
    force_full_page_ocr: bool = False

    # Area threshold for a bitmap to be processed with OCR. The original note
    # calls this a "percentage of the area"; with a default of 0.05 it is
    # presumably a fraction (i.e. 5%) — TODO confirm against the OCR model.
    bitmap_area_threshold: float = 0.05
class EasyOcrOptions(OcrOptions):
    # OCR options selected when ``kind`` is "easyocr".

    kind: Literal["easyocr"] = "easyocr"

    # Language codes handed to the engine.
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader

    # extra="forbid": unknown fields are rejected instead of silently ignored.
    # protected_namespaces=(): lifts pydantic's reservation of the "model_"
    # prefix, which the ``model_storage_directory`` field would otherwise
    # collide with.
    model_config = ConfigDict(
        extra="forbid",
        protected_namespaces=(),
    )
class TesseractCliOcrOptions(OcrOptions):
    # OCR options selected when ``kind`` is "tesseract".

    kind: Literal["tesseract"] = "tesseract"

    # Language codes handed to the engine.
    lang: List[str] = ["fra", "deu", "spa", "eng"]

    # Command name; presumably the executable that gets invoked —
    # TODO confirm against the OCR model that consumes these options.
    tesseract_cmd: str = "tesseract"

    # NOTE(review): meaning of ``path`` is not visible here (likely the
    # tesseract data directory) — confirm against the consumer.
    path: Optional[str] = None

    # Unknown fields are rejected instead of silently ignored.
    model_config = ConfigDict(
        extra="forbid",
    )
class TesseractOcrOptions(OcrOptions):
    # OCR options selected when ``kind`` is "tesserocr".

    kind: Literal["tesserocr"] = "tesserocr"

    # Language codes handed to the engine.
    lang: List[str] = ["fra", "deu", "spa", "eng"]

    # NOTE(review): meaning of ``path`` is not visible here (likely the
    # tesseract data directory) — confirm against the consumer.
    path: Optional[str] = None

    # Unknown fields are rejected instead of silently ignored.
    model_config = ConfigDict(
        extra="forbid",
    )
class PipelineOptions(BaseModel):
    # Base options shared by all pipelines.

    # This default will be set to False in a future version of docling.
    create_legacy_output: bool = True
class PdfPipelineOptions(PipelineOptions):
    # Options for the PDF pipeline, extending the base PipelineOptions.

    # NOTE(review): presumably where model artifacts are looked up; None
    # leaves the choice to the consumer — confirm against the pipeline.
    artifacts_path: Optional[Union[Path, str]] = None

    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    # Discriminated union: the ``kind`` value of the supplied options decides
    # which of the three option classes is validated. EasyOCR is the default.
    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
        Field(EasyOcrOptions(), discriminator="kind")
    )

    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    generate_table_images: bool = False