feat: new artifacts path and CLI utility (#876)

* fix artifacts path Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docling-models utility Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing formatting Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename utility to docling-tools Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename download methods and deprecation warnings Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * propagate artifacts path usage for ocr models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move function to utils Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * update docs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * simplify downloading specific model(s) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * minor refactor Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-02-06 15:46:32 +01:00
parent 722a6eb7b9
commit ed74fe2ec0
15 changed files with 467 additions and 68 deletions
@@ -28,7 +28,7 @@ jobs:
        run: |
          for file in docs/examples/*.py; do
            # Skip batch_convert.py
-            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then
+            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then
                echo "Skipping $file"
                continue
            fi
@@ -0,0 +1,105 @@
 import logging
 import warnings
 from enum import Enum
 from pathlib import Path
 from typing import Annotated, Optional
 import typer
 from rich.console import Console
 from rich.logging import RichHandler
 from docling.datamodel.settings import settings
 from docling.utils.model_downloader import download_models
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
 console = Console()
 err_console = Console(stderr=True)
 app = typer.Typer(
    name="Docling models helper",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
 )
 class _AvailableModels(str, Enum):
    LAYOUT = "layout"
    TABLEFORMER = "tableformer"
    CODE_FORMULA = "code_formula"
    PICTURE_CLASSIFIER = "picture_classifier"
    EASYOCR = "easyocr"
@app.command("download")
 def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = (settings.cache_dir / "models"),
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    models: Annotated[
        Optional[list[_AvailableModels]],
        typer.Argument(
            help=f"Models to download (default behavior: all will be downloaded)",
        ),
    ] = None,
    quiet: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            "--quiet",
            help="No extra output is generated, the CLI prints only the directory with the cached models.",
        ),
    ] = False,
 ):
    if not quiet:
        FORMAT = "%(message)s"
        logging.basicConfig(
            level=logging.INFO,
            format="[blue]%(message)s[/blue]",
            datefmt="[%X]",
            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
        )
    to_download = models or [m for m in _AvailableModels]
    output_dir = download_models(
        output_dir=output_dir,
        force=force,
        progress=(not quiet),
        with_layout=_AvailableModels.LAYOUT in to_download,
        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
        with_easyocr=_AvailableModels.EASYOCR in to_download,
    )
    if quiet:
        typer.echo(output_dir)
    else:
        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
        console.print(
            "\n",
            "Docling can now be configured for running offline using the local artifacts.\n\n",
            "Using the CLI:",
            f"`docling --artifacts-path={output_dir} FILE`",
            "\n",
            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
        )
 click_app = typer.main.get_command(app)
 if __name__ == "__main__":
    app()
@@ -0,0 +1,17 @@
 import typer
 from docling.cli.models import app as models_app
 app = typer.Typer(
    name="Docling helpers",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
 )
 app.add_typer(models_app, name="models")
 click_app = typer.main.get_command(app)
 if __name__ == "__main__":
    app()
@@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings
    debug: DebugSettings
    cache_dir: Path = Path.home() / ".cache" / "docling"
 settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
@@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
        Processes the given batch of elements and enriches them with predictions.
    """
    _model_repo_folder = "CodeFormula"
    elements_batch_size = 5
    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
    expansion_factor = 0.03
@@ -68,7 +69,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
        options: CodeFormulaModelOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -97,9 +98,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            )
            if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
            else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder
            self.code_formula_model = CodeFormulaPredictor(
                artifacts_path=artifacts_path,
@@ -108,13 +109,16 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
            )
    @staticmethod
-    def download_models_hf(
+    def download_models(
-        local_dir: Optional[Path] = None, force: bool = False
+        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
-        disable_progress_bars()
+        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/CodeFormula",
            force_download=force,
@@ -55,12 +55,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
        Processes a batch of elements and adds classification annotations.
    """
    _model_repo_folder = "DocumentFigureClassifier"
    images_scale = 2
    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Optional[Path],
        options: DocumentPictureClassifierOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -88,9 +89,9 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
            )
            if artifacts_path is None:
-                artifacts_path = self.download_models_hf()
+                artifacts_path = self.download_models()
            else:
-                artifacts_path = Path(artifacts_path)
+                artifacts_path = artifacts_path / self._model_repo_folder
            self.document_picture_classifier = DocumentFigureClassifierPredictor(
                artifacts_path=artifacts_path,
@@ -99,13 +100,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
            )
    @staticmethod
-    def download_models_hf(
+    def download_models(
-        local_dir: Optional[Path] = None, force: bool = False
+        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
-        disable_progress_bars()
+        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/DocumentFigureClassifier",
            force_download=force,
@@ -1,7 +1,10 @@
 import logging
 import warnings
-from typing import Iterable
+import zipfile
 from pathlib import Path
 from typing import Iterable, List, Optional
 import httpx
 import numpy
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 from docling.utils.utils import download_url_with_progress
 _log = logging.getLogger(__name__)
 class EasyOcrModel(BaseOcrModel):
    _model_repo_folder = "EasyOcr"
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: EasyOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
                )
                use_gpu = self.options.use_gpu
            download_enabled = self.options.download_enabled
            model_storage_directory = self.options.model_storage_directory
            if artifacts_path is not None and model_storage_directory is None:
                download_enabled = False
                model_storage_directory = str(artifacts_path / self._model_repo_folder)
            self.reader = easyocr.Reader(
                lang_list=self.options.lang,
                gpu=use_gpu,
-                model_storage_directory=self.options.model_storage_directory,
+                model_storage_directory=model_storage_directory,
                recog_network=self.options.recog_network,
-                download_enabled=self.options.download_enabled,
+                download_enabled=download_enabled,
                verbose=False,
            )
    @staticmethod
    def download_models(
        detection_models: List[str] = ["craft"],
        recognition_models: List[str] = ["english_g2", "latin_g2"],
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
        from easyocr.config import detection_models as det_models_dict
        from easyocr.config import recognition_models as rec_models_dict
        if local_dir is None:
            local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
        local_dir.mkdir(parents=True, exist_ok=True)
        # Collect models to download
        download_list = []
        for model_name in detection_models:
            if model_name in det_models_dict:
                download_list.append(det_models_dict[model_name])
        for model_name in recognition_models:
            if model_name in rec_models_dict["gen2"]:
                download_list.append(rec_models_dict["gen2"][model_name])
        # Download models
        for model_details in download_list:
            buf = download_url_with_progress(model_details["url"], progress=progress)
            with zipfile.ZipFile(buf, "r") as zip_ref:
                zip_ref.extractall(local_dir)
        return local_dir
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
@@ -1,7 +1,8 @@
 import copy
 import logging
 import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
 class LayoutModel(BasePageModel):
    _model_repo_folder = "docling-models"
    _model_path = "model_artifacts/layout"
    TEXT_ELEM_LABELS = [
        DocItemLabel.TEXT,
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
    FORMULA_LABEL = DocItemLabel.FORMULA
    CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
-    def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
+    def __init__(
        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
    ):
        device = decide_device(accelerator_options.device)
        if artifacts_path is None:
            artifacts_path = self.download_models() / self._model_path
        else:
            # will become the default in the future
            if (artifacts_path / self._model_repo_folder).exists():
                artifacts_path = (
                    artifacts_path / self._model_repo_folder / self._model_path
                )
            elif (artifacts_path / self._model_path).exists():
                warnings.warn(
                    "The usage of artifacts_path containing directly "
                    f"{self._model_path} is deprecated. Please point "
                    "the artifacts_path to the parent containing "
                    f"the {self._model_repo_folder} folder.",
                    DeprecationWarning,
                    stacklevel=3,
                )
                artifacts_path = artifacts_path / self._model_path
        self.layout_predictor = LayoutPredictor(
            artifact_path=str(artifacts_path),
            device=device,
            num_threads=accelerator_options.num_threads,
        )
    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
        return Path(download_path)
    def draw_clusters_and_cells_side_by_side(
        self, conv_res, page, clusters, mode_prefix: str, show: bool = False
    ):
@@ -1,6 +1,7 @@
 import copy
 import warnings
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Optional, Union
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
 class TableStructureModel(BasePageModel):
    _model_repo_folder = "docling-models"
    _model_path = "model_artifacts/tableformer"
    def __init__(
        self,
        enabled: bool,
-        artifacts_path: Path,
+        artifacts_path: Optional[Path],
        options: TableStructureOptions,
        accelerator_options: AcceleratorOptions,
    ):
@@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
        self.enabled = enabled
        if self.enabled:
            if artifacts_path is None:
                artifacts_path = self.download_models() / self._model_path
            else:
                # will become the default in the future
                if (artifacts_path / self._model_repo_folder).exists():
                    artifacts_path = (
                        artifacts_path / self._model_repo_folder / self._model_path
                    )
                elif (artifacts_path / self._model_path).exists():
                    warnings.warn(
                        "The usage of artifacts_path containing directly "
                        f"{self._model_path} is deprecated. Please point "
                        "the artifacts_path to the parent containing "
                        f"the {self._model_repo_folder} folder.",
                        DeprecationWarning,
                        stacklevel=3,
                    )
                    artifacts_path = artifacts_path / self._model_path
            if self.mode == TableFormerMode.ACCURATE:
                artifacts_path = artifacts_path / "accurate"
            else:
@@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
            )
            self.scale = 2.0  # Scale up table input images to 144 dpi
    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars
        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
        return Path(download_path)
    def draw_table_and_cells(
        self,
        conv_res: ConversionResult,
@@ -1,5 +1,6 @@
 import logging
 import sys
 import warnings
 from pathlib import Path
 from typing import Optional
@@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
 from docling.models.document_picture_classifier import (
@@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
 from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.model_downloader import download_models
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 _log = logging.getLogger(__name__)
 class StandardPdfPipeline(PaginatedPipeline):
-    _layout_model_path = "model_artifacts/layout"
+    _layout_model_path = LayoutModel._model_path
-    _table_model_path = "model_artifacts/tableformer"
+    _table_model_path = TableStructureModel._model_path
    def __init__(self, pipeline_options: PdfPipelineOptions):
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions
-        if pipeline_options.artifacts_path is None:
+        artifacts_path: Optional[Path] = None
-            self.artifacts_path = self.download_models_hf()
+        if pipeline_options.artifacts_path is not None:
-        else:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
            self.artifacts_path = Path(pipeline_options.artifacts_path)
        self.keep_images = (
            self.pipeline_options.generate_page_images
@@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):
        self.glm_model = GlmModel(options=GlmOptions())
-        if (ocr_model := self.get_ocr_model()) is None:
+        if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )
@@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
            ocr_model,
            # Layout model
            LayoutModel(
-                artifacts_path=self.artifacts_path
+                artifacts_path=artifacts_path,
                / StandardPdfPipeline._layout_model_path,
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
-                artifacts_path=self.artifacts_path
+                artifacts_path=artifacts_path,
                / StandardPdfPipeline._table_model_path,
                options=pipeline_options.table_structure_options,
                accelerator_options=pipeline_options.accelerator_options,
            ),
@@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            CodeFormulaModel(
                enabled=pipeline_options.do_code_enrichment
                or pipeline_options.do_formula_enrichment,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=pipeline_options.do_code_enrichment,
                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
@@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            # Document Picture Classifier
            DocumentPictureClassifier(
                enabled=pipeline_options.do_picture_classification,
-                artifacts_path=pipeline_options.artifacts_path,
+                artifacts_path=artifacts_path,
                options=DocumentPictureClassifierOptions(),
                accelerator_options=pipeline_options.accelerator_options,
            ),
@@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
-        from huggingface_hub import snapshot_download
+        warnings.warn(
-        from huggingface_hub.utils import disable_progress_bars
+            "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
-
+            "use instead the utility `docling-tools models download`, or "
-        disable_progress_bars()
+            "the upstream method docling.utils.models_downloader.download_all()",
-        download_path = snapshot_download(
+            DeprecationWarning,
-            repo_id="ds4sd/docling-models",
+            stacklevel=3,
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
-        return Path(download_path)
+        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
        return output_dir
-    def get_ocr_model(self) -> Optional[BaseOcrModel]:
+    def get_ocr_model(
        self, artifacts_path: Optional[Path] = None
    ) -> Optional[BaseOcrModel]:
        if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
            return EasyOcrModel(
                enabled=self.pipeline_options.do_ocr,
                artifacts_path=artifacts_path,
                options=self.pipeline_options.ocr_options,
                accelerator_options=self.pipeline_options.accelerator_options,
            )
@@ -0,0 +1,72 @@
 import logging
 from pathlib import Path
 from typing import Optional
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
 _log = logging.getLogger(__name__)
 def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
    with_layout: bool = True,
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_easyocr: bool = True,
 ):
    if output_dir is None:
        output_dir = settings.cache_dir / "models"
    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)
    if with_layout:
        _log.info(f"Downloading layout model...")
        LayoutModel.download_models(
            local_dir=output_dir / LayoutModel._model_repo_folder,
            force=force,
            progress=progress,
        )
    if with_tableformer:
        _log.info(f"Downloading tableformer model...")
        TableStructureModel.download_models(
            local_dir=output_dir / TableStructureModel._model_repo_folder,
            force=force,
            progress=progress,
        )
    if with_picture_classifier:
        _log.info(f"Downloading picture classifier model...")
        DocumentPictureClassifier.download_models(
            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
            force=force,
            progress=progress,
        )
    if with_code_formula:
        _log.info(f"Downloading code formula model...")
        CodeFormulaModel.download_models(
            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
            force=force,
            progress=progress,
        )
    if with_easyocr:
        _log.info(f"Downloading easyocr models...")
        EasyOcrModel.download_models(
            local_dir=output_dir / EasyOcrModel._model_repo_folder,
            force=force,
            progress=progress,
        )
    return output_dir
@@ -4,6 +4,9 @@ from itertools import islice
 from pathlib import Path
 from typing import List, Union
 import requests
 from tqdm import tqdm
 def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable."""
@@ -39,3 +42,24 @@ def create_hash(string: str):
    hasher.update(string.encode("utf-8"))
    return hasher.hexdigest()
 def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        total_size = int(response.headers.get("content-length", 0))
        progress_bar = tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        )
        for chunk in response.iter_content(10 * 1024):
            buf.write(chunk)
            progress_bar.update(len(chunk))
        progress_bar.close()
    buf.seek(0)
    return buf
@@ -26,12 +26,56 @@ To see all available options (export formats etc.) run `docling --help`. More de
 ### Advanced options
 #### Model prefetching and offline usage
 By default, models are downloaded automatically upon first usage. If you would prefer
 to explicitly prefetch them for offline use (e.g. in air-gapped environments) you can do
 that as follows:
 **Step 1: Prefetch the models**
 Use the `docling-tools models download` utility:
 ```sh
 $ docling-tools models download
 Downloading layout model...
 Downloading tableformer model...
 Downloading picture classifier model...
 Downloading code formula model...
 Downloading easyocr models...
 Models downloaded into $HOME/.cache/docling/models.
 ```
 Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()`.
 **Step 2: Use the prefetched models**
 ```python
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 artifacts_path = "/local/path/to/models"
 pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
 doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
 )
 ```
 Or using the CLI:
 ```sh
 docling --artifacts-path="/local/path/to/models" FILE
 ```
 #### Adjust pipeline features
 The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
 one can adjust the conversion pipeline and features.
 ##### Control PDF table extraction options
 You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
@@ -70,28 +114,6 @@ doc_converter = DocumentConverter(
 )
 ```
 ##### Provide specific artifacts path
 By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
 ```python
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 # # to explicitly prefetch:
 # artifacts_path = StandardPdfPipeline.download_models_hf()
 artifacts_path = "/local/path/to/artifacts"
 pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
 doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
 )
 ```
 #### Impose limits on the document size
@@ -7248,6 +7248,20 @@ files = [
 [package.dependencies]
 urllib3 = ">=2"
 [[package]]
 name = "types-tqdm"
 version = "4.67.0.20241221"
 description = "Typing stubs for tqdm"
 optional = false
 python-versions = ">=3.8"
 files = [
    {file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
    {file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
 ]
 [package.dependencies]
 types-requests = "*"
 [[package]]
 name = "typing-extensions"
 version = "4.12.2"
@@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632"
+content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7"
@@ -60,6 +60,7 @@ onnxruntime = [
  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
 ]
 pillow = "^10.0.0"
 tqdm = "^4.65.0"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -79,6 +80,7 @@ ipykernel = "^6.29.5"
 ipywidgets = "^8.1.5"
 nbqa = "^1.9.0"
 types-openpyxl = "^3.1.5.20241114"
 types-tqdm = "^4.67.0.20241221"
 [tool.poetry.group.docs.dependencies]
 mkdocs-material = "^9.5.40"
@@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
 [tool.poetry.scripts]
 docling = "docling.cli.main:app"
 docling-tools = "docling.cli.tools:app"
 [build-system]
 requires = ["poetry-core"]