From eeee3b4371cb8207a8e7a887acba3fc5afc6de4d Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:27:56 +0100 Subject: [PATCH] docs: add explicit artifacts path example (#224) * docs: add explicit artifacts path example [skip ci] Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * minor docs fix [skip ci] Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * touch to trigger needed checks Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/usage.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 9399337..06d2c3c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,6 +115,29 @@ doc_converter = DocumentConverter( ) ``` +##### Provide specific artifacts path + +By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: + +```python +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline + +# # to explicitly prefetch: +# artifacts_path = StandardPdfPipeline.download_models_hf() + +artifacts_path = "/local/path/to/artifacts" + +pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } +) +``` + #### Impose limits on the document size You can limit the file size and number of pages which should be allowed to process per document: