From 5101e2519e7a5bb727531b1412b1131a7cfbda52 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Wed, 12 Feb 2025 13:08:37 +0100
Subject: [PATCH] feat: allow artifacts_path to be defined as ENV (#940)

* allow the artifacts_path to be defined as ENV

Signed-off-by: Michele Dolfi

* add check if artifacts_path exists and is dir

Signed-off-by: Michele Dolfi

---------

Signed-off-by: Michele Dolfi
---
 docling/datamodel/settings.py             | 3 ++-
 docling/pipeline/standard_pdf_pipeline.py | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py
index 439ffe7..fee871a 100644
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -1,6 +1,6 @@
 import sys
 from pathlib import Path
-from typing import Annotated, Tuple
+from typing import Annotated, Optional, Tuple
 
 from pydantic import BaseModel, PlainValidator
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -62,6 +62,7 @@ class AppSettings(BaseSettings):
     debug: DebugSettings
 
     cache_dir: Path = Path.home() / ".cache" / "docling"
+    artifacts_path: Optional[Path] = None
 
 
 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 13e435f..1c71bf7 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -61,6 +61,14 @@ class StandardPdfPipeline(PaginatedPipeline):
         artifacts_path: Optional[Path] = None
         if pipeline_options.artifacts_path is not None:
             artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
 
         self.keep_images = (
             self.pipeline_options.generate_page_images