feat: allow artifacts_path to be defined as ENV (#940)

* allow the artifacts_path to be defined via an environment variable

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add a check that artifacts_path exists and is a directory

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-12 13:08:37 +01:00 committed by GitHub
parent c47ae700ec
commit 5101e2519e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 10 additions and 1 deletions

View File

@ -1,6 +1,6 @@
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Annotated, Tuple from typing import Annotated, Optional, Tuple
from pydantic import BaseModel, PlainValidator from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
debug: DebugSettings debug: DebugSettings
cache_dir: Path = Path.home() / ".cache" / "docling" cache_dir: Path = Path.home() / ".cache" / "docling"
artifacts_path: Optional[Path] = None
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings()) settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

View File

@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
artifacts_path: Optional[Path] = None artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None: if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser() artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
self.keep_images = ( self.keep_images = (
self.pipeline_options.generate_page_images self.pipeline_options.generate_page_images