feat: allow artifacts_path to be defined as ENV (#940)

* allow artifacts_path to be set through an environment variable

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add a check that artifacts_path, when defined, points to an existing directory

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-12 13:08:37 +01:00 committed by GitHub
parent c47ae700ec
commit 5101e2519e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 1 deletions

View File

@ -1,6 +1,6 @@
import sys
from pathlib import Path
from typing import Annotated, Tuple
from typing import Annotated, Optional, Tuple
from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict
@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
debug: DebugSettings
cache_dir: Path = Path.home() / ".cache" / "docling"
artifacts_path: Optional[Path] = None
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

View File

@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
self.keep_images = (
self.pipeline_options.generate_page_images