
* feat: add coverage_threshold to skip OCR for small images Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * filter individual boxes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename option Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
84 lines
2.4 KiB
Python
84 lines
2.4 KiB
Python
from enum import Enum, auto
|
|
from pathlib import Path
|
|
from typing import List, Literal, Optional, Union
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
class TableFormerMode(str, Enum):
    """Mode selector referenced by ``TableStructureOptions.mode``.

    Members are ``str`` instances as well as enum members; their values
    are produced by ``auto()``.
    """

    FAST = auto()
    ACCURATE = auto()
|
|
|
|
|
|
class TableStructureOptions(BaseModel):
    """Options controlling table structure extraction."""

    # True:  match predictions back to PDF cells. Can break table output if
    #        PDF cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    #        the PDF cells.
    do_cell_matching: bool = True

    # Which TableFormer mode to run; FAST is the default.
    mode: TableFormerMode = TableFormerMode.FAST
|
|
|
|
|
|
class OcrOptions(BaseModel):
    """Base options shared by every OCR engine configuration."""

    # Engine identifier; subclasses pin it to a Literal so it can serve as
    # the discriminator of a tagged union.
    kind: str

    # Area threshold for a bitmap to be processed with OCR.
    # NOTE(review): the original comment calls this a "percentage", but the
    # default of 0.05 looks like a fraction (i.e. 5%) — confirm with the
    # code that consumes it.
    bitmap_area_threshold: float = 0.05
|
|
|
|
|
|
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine (``kind == "easyocr"``)."""

    # Reject unknown keys. ``protected_namespaces=()`` lifts pydantic's
    # reserved ``model_`` prefix so that ``model_storage_directory`` can be
    # declared as a regular field.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    kind: Literal["easyocr"] = "easyocr"

    # Language codes handed to the OCR engine.
    lang: List[str] = ["fr", "de", "es", "en"]

    use_gpu: bool = True  # same default as easyocr.Reader
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader
|
|
|
|
|
|
class TesseractCliOcrOptions(OcrOptions):
    """Options for the Tesseract command-line engine (``kind == "tesseract"``)."""

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")

    kind: Literal["tesseract"] = "tesseract"

    # Language codes handed to the OCR engine.
    lang: List[str] = ["fra", "deu", "spa", "eng"]

    # Presumably the executable name or path used to invoke Tesseract —
    # confirm against the OCR model that consumes these options.
    tesseract_cmd: str = "tesseract"

    # NOTE(review): meaning not visible in this file (possibly a data
    # directory for Tesseract) — confirm with the consumer.
    path: Optional[str] = None
|
|
|
|
|
|
class TesseractOcrOptions(OcrOptions):
    """Options for the tesserocr-based engine (``kind == "tesserocr"``)."""

    # Unknown keys are rejected at validation time.
    model_config = ConfigDict(extra="forbid")

    kind: Literal["tesserocr"] = "tesserocr"

    # Language codes handed to the OCR engine.
    lang: List[str] = ["fra", "deu", "spa", "eng"]

    # NOTE(review): meaning not visible in this file (possibly a data
    # directory for Tesseract) — confirm with the consumer.
    path: Optional[str] = None
|
|
|
|
|
|
class PipelineOptions(BaseModel):
    """Options common to all pipelines."""

    # This default will be set to False in a future version of docling.
    create_legacy_output: bool = True
|
|
|
|
|
|
class PdfPipelineOptions(PipelineOptions):
    """Options for the PDF pipeline."""

    artifacts_path: Optional[Union[Path, str]] = None

    # --- feature switches ---
    # True: perform table structure extraction
    do_table_structure: bool = True
    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # --- per-feature settings ---
    table_structure_options: TableStructureOptions = TableStructureOptions()
    # Tagged union: the ``kind`` field of the supplied options selects which
    # OCR options class is used; EasyOCR is the default.
    ocr_options: Union[
        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions
    ] = Field(default=EasyOcrOptions(), discriminator="kind")

    # --- image settings ---
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    generate_table_images: bool = False
|