
* feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * updated the poetry lock Signed-off-by: Suhwan Seo <nuridol@gmail.com> * Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo <nuridol@gmail.com> * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * docs: update examples and installation for ocrmac support - Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples. - Included usage comments and examples for `OcrMacOptions` in OCR pipelines. - Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+). - Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend. This enhances documentation for users working on macOS to leverage `ocrmac` effectively. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * fix: update `ocrmac` dependency with macOS-specific marker - Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility. - Updated the content hash in `poetry.lock` to reflect the changes. 
This ensures the `ocrmac` dependency is only installed on macOS systems. Signed-off-by: Suhwan Seo <nuridol@gmail.com> --------- Signed-off-by: Suhwan Seo <nuridol@gmail.com> Co-authored-by: Suhwan Seo <nuridol@gmail.com>
104 lines
3.0 KiB
Python
104 lines
3.0 KiB
Python
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import List, Literal, Optional, Union
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
|
class TableFormerMode(str, Enum):
    """Mode selector referenced by `TableStructureOptions.mode`.

    Members are also `str` instances, so they compare equal to their plain
    string values ("fast" / "accurate").
    """

    FAST = "fast"
    ACCURATE = "accurate"
|
|
|
|
|
|
class TableStructureOptions(BaseModel):
    """Options for the table structure extraction step."""

    # True: Matches predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True

    # Defaults to the FAST mode; see TableFormerMode for the alternatives.
    mode: TableFormerMode = TableFormerMode.FAST
|
|
|
|
|
|
class OcrOptions(BaseModel):
    """Base fields shared by the engine-specific OCR option classes.

    `kind` and `lang` have no defaults here; each subclass in this module
    supplies its own.
    """

    kind: str
    lang: List[str]
    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied

    # Percentage of the area for a bitmap to be processed with OCR.
    # NOTE(review): 0.05 reads like a fraction (i.e. 5%) rather than a
    # percentage value -- confirm against the code that consumes it.
    bitmap_area_threshold: float = 0.05
|
|
|
|
|
|
class EasyOcrOptions(OcrOptions):
    """OCR options for the `easyocr` engine (`kind == "easyocr"`)."""

    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader

    model_config = ConfigDict(
        # Unknown fields are rejected instead of silently accepted.
        extra="forbid",
        # Empty tuple turns off pydantic's protected-namespace check;
        # presumably needed because `model_storage_directory` starts with
        # the reserved `model_` prefix -- verify before removing.
        protected_namespaces=(),
    )
|
|
|
|
|
|
class TesseractCliOcrOptions(OcrOptions):
    """OCR options for the Tesseract command-line engine (`kind == "tesseract"`)."""

    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # Presumably the executable name or path used to invoke Tesseract --
    # verify against the OCR model that reads this option.
    tesseract_cmd: str = "tesseract"
    # NOTE(review): meaning of `path` is not visible in this file (likely a
    # Tesseract data directory) -- confirm with the consumer.
    path: Optional[str] = None

    model_config = ConfigDict(
        # Unknown fields are rejected instead of silently accepted.
        extra="forbid",
    )
|
|
|
|
|
|
class TesseractOcrOptions(OcrOptions):
    """OCR options for the `tesserocr` engine (`kind == "tesserocr"`)."""

    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # NOTE(review): meaning of `path` is not visible in this file (likely a
    # Tesseract data directory) -- confirm with the consumer.
    path: Optional[str] = None

    model_config = ConfigDict(
        # Unknown fields are rejected instead of silently accepted.
        extra="forbid",
    )
|
|
|
|
|
|
class OcrMacOptions(OcrOptions):
    """OCR options for the `ocrmac` engine (`kind == "ocrmac"`)."""

    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    # Both fields are free-form strings and are not validated here.
    # NOTE(review): the accepted values are presumably defined by the
    # `ocrmac` package -- confirm before passing anything but the defaults.
    recognition: str = "accurate"
    framework: str = "vision"

    model_config = ConfigDict(
        # Unknown fields are rejected instead of silently accepted.
        extra="forbid",
    )
|
|
|
|
|
|
class PipelineOptions(BaseModel):
    """Base options shared by pipeline option classes."""

    # This default will be set to False on a future version of docling
    create_legacy_output: bool = True
|
|
|
|
|
|
class PdfPipelineOptions(PipelineOptions):
    """Options for the PDF pipeline: table structure, OCR and image output."""

    # NOTE(review): presumably the location of model artifacts -- the
    # consumer is not visible in this file.
    artifacts_path: Optional[Union[Path, str]] = None
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
    # Discriminated union: the `kind` literal of each options class selects
    # which one is used during validation. Defaults to EasyOCR.
    ocr_options: Union[
        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
    ] = Field(EasyOcrOptions(), discriminator="kind")

    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    # Deprecated; the message below tells users what to use instead.
    generate_table_images: bool = Field(
        default=False,
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )
|