Docling/docling/datamodel/settings.py
Michele Dolfi ed74fe2ec0
feat: new artifacts path and CLI utility (#876)
* fix artifacts path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add docling-models utility

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* missing formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename utility to docling-tools

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename download methods and deprecation warnings

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* propagate artifacts path usage for ocr models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move function to utils

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unused file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* simplify downloading specific model(s)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* minor refactor

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-02-06 15:46:32 +01:00

68 lines
1.8 KiB
Python

import sys
from pathlib import Path
from typing import Annotated, Tuple
from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
if v[0] < 1 or v[1] < v[0]:
raise ValueError(
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
)
return v
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize
page_range: PageRange = DEFAULT_PAGE_RANGE
class BatchConcurrencySettings(BaseModel):
doc_batch_size: int = 2
doc_batch_concurrency: int = 2
page_batch_size: int = 4
page_batch_concurrency: int = 2
elements_batch_size: int = 16
# doc_batch_size: int = 1
# doc_batch_concurrency: int = 1
# page_batch_size: int = 1
# page_batch_concurrency: int = 1
# model_concurrency: int = 2
# To force models into single core: export OMP_NUM_THREADS=1
class DebugSettings(BaseModel):
visualize_cells: bool = False
visualize_ocr: bool = False
visualize_layout: bool = False
visualize_raw_layout: bool = False
visualize_tables: bool = False
profile_pipeline_timings: bool = False
# Path used to output debug information.
debug_output_path: str = str(Path.cwd() / "debug")
class AppSettings(BaseSettings):
model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
perf: BatchConcurrencySettings
debug: DebugSettings
cache_dir: Path = Path.home() / ".cache" / "docling"
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())