feat: new artifacts path and CLI utility (#876)

* fix artifacts path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add docling-models utility

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* missing formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename utility to docling-tools

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename download methods and deprecation warnings

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* propagate artifacts path usage for ocr models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move function to utils

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unused file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* simplify downloading specific model(s)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* minor refactor

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Michele Dolfi
2025-02-06 15:46:32 +01:00
committed by GitHub
parent 722a6eb7b9
commit ed74fe2ec0
15 changed files with 467 additions and 68 deletions

View File

@@ -1,7 +1,8 @@
import copy
import logging
import warnings
from pathlib import Path
from typing import Iterable
from typing import Iterable, Optional, Union
from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
_model_repo_folder = "docling-models"
_model_path = "model_artifacts/layout"
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,
@@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
FORMULA_LABEL = DocItemLabel.FORMULA
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
def __init__(
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
):
device = decide_device(accelerator_options.device)
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
# will become the default in the future
if (artifacts_path / self._model_repo_folder).exists():
artifacts_path = (
artifacts_path / self._model_repo_folder / self._model_path
)
elif (artifacts_path / self._model_path).exists():
warnings.warn(
"The usage of artifacts_path containing directly "
f"{self._model_path} is deprecated. Please point "
"the artifacts_path to the parent containing "
f"the {self._model_repo_folder} folder.",
DeprecationWarning,
stacklevel=3,
)
artifacts_path = artifacts_path / self._model_path
self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path),
device=device,
num_threads=accelerator_options.num_threads,
)
@staticmethod
def download_models(
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,
local_dir=local_dir,
revision="v2.1.0",
)
return Path(download_path)
def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
):