feat: new artifacts path and CLI utility (#876)

* fix artifacts path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add docling-models utility

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* missing formatting

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename utility to docling-tools

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename download methods and deprecation warnings

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* propagate artifacts path usage for ocr models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move function to utils

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unused file

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* simplify downloading specific model(s)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* minor refactor

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Michele Dolfi 2025-02-06 15:46:32 +01:00 committed by GitHub
parent 722a6eb7b9
commit ed74fe2ec0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 467 additions and 68 deletions

View File

@ -28,7 +28,7 @@ jobs:
run: | run: |
for file in docs/examples/*.py; do for file in docs/examples/*.py; do
# Skip batch_convert.py # Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then
echo "Skipping $file" echo "Skipping $file"
continue continue
fi fi

105
docling/cli/models.py Normal file
View File

@ -0,0 +1,105 @@
import logging
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Optional
import typer
from rich.console import Console
from rich.logging import RichHandler
from docling.datamodel.settings import settings
from docling.utils.model_downloader import download_models
# Hide UserWarnings raised from pydantic/torch modules and FutureWarnings
# raised from easyocr so they do not clutter the CLI output.
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings("ignore", category=FutureWarning, module="easyocr")

# Rich consoles: the default one, and one that writes to stderr.
console = Console()
err_console = Console(stderr=True)

# Typer application hosting the model-related commands; running it without
# arguments shows the help text.
app = typer.Typer(
    name="Docling models helper",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)
class _AvailableModels(str, Enum):
LAYOUT = "layout"
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
EASYOCR = "easyocr"
# CLI command: download the selected model artifacts into ``output_dir``.
#
# Parameters
#   output_dir  Target directory; defaults to ``settings.cache_dir / "models"``.
#   force       Forwarded to ``download_models`` to force the download.
#   models      Optional subset of ``_AvailableModels``; when omitted (or
#               empty) every available model is downloaded.
#   quiet       When set, logging and progress output are suppressed and only
#               the resulting directory is echoed.
@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = (settings.cache_dir / "models"),
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    models: Annotated[
        Optional[list[_AvailableModels]],
        typer.Argument(
            help="Models to download (default behavior: all will be downloaded)",
        ),
    ] = None,
    quiet: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            "--quiet",
            help="No extra output is generated, the CLI prints only the directory with the cached models.",
        ),
    ] = False,
):
    if not quiet:
        # Emit INFO records through Rich, wrapped in [blue] markup, without
        # level or time columns.
        logging.basicConfig(
            level=logging.INFO,
            format="[blue]%(message)s[/blue]",
            datefmt="[%X]",
            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
        )

    # No explicit selection (None or an empty list) means "download everything".
    to_download = models or list(_AvailableModels)

    output_dir = download_models(
        output_dir=output_dir,
        force=force,
        progress=(not quiet),
        with_layout=_AvailableModels.LAYOUT in to_download,
        with_tableformer=_AvailableModels.TABLEFORMER in to_download,
        with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
        with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
        with_easyocr=_AvailableModels.EASYOCR in to_download,
    )

    if quiet:
        # Quiet mode prints only the directory, so the output can be captured
        # by a calling script.
        typer.echo(output_dir)
    else:
        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")

        console.print(
            "\n",
            "Docling can now be configured for running offline using the local artifacts.\n\n",
            "Using the CLI:",
            f"`docling --artifacts-path={output_dir} FILE`",
            "\n",
            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
        )
# Click command object derived from the Typer app.
# NOTE(review): presumably exported for Click-based tooling - confirm.
click_app = typer.main.get_command(app)

# Allow running this module directly as a script.
if __name__ == "__main__":
    app()

17
docling/cli/tools.py Normal file
View File

@ -0,0 +1,17 @@
import typer
from docling.cli.models import app as models_app
# Root Typer application for the Docling helper commands; running it without
# arguments shows the help text.
app = typer.Typer(
    name="Docling helpers",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)

# Mount the model-management commands under the "models" sub-command.
app.add_typer(models_app, name="models")

# Click command object derived from the Typer app.
# NOTE(review): presumably exported for Click-based tooling - confirm.
click_app = typer.main.get_command(app)

# Allow running this module directly as a script.
if __name__ == "__main__":
    app()

View File

@ -61,5 +61,7 @@ class AppSettings(BaseSettings):
perf: BatchConcurrencySettings perf: BatchConcurrencySettings
debug: DebugSettings debug: DebugSettings
cache_dir: Path = Path.home() / ".cache" / "docling"
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings()) settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

View File

@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
Processes the given batch of elements and enriches them with predictions. Processes the given batch of elements and enriches them with predictions.
""" """
_model_repo_folder = "CodeFormula"
elements_batch_size = 5 elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.03 expansion_factor = 0.03
@ -68,7 +69,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
artifacts_path: Optional[Union[Path, str]], artifacts_path: Optional[Path],
options: CodeFormulaModelOptions, options: CodeFormulaModelOptions,
accelerator_options: AcceleratorOptions, accelerator_options: AcceleratorOptions,
): ):
@ -97,9 +98,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
) )
if artifacts_path is None: if artifacts_path is None:
artifacts_path = self.download_models_hf() artifacts_path = self.download_models()
else: else:
artifacts_path = Path(artifacts_path) artifacts_path = artifacts_path / self._model_repo_folder
self.code_formula_model = CodeFormulaPredictor( self.code_formula_model = CodeFormulaPredictor(
artifacts_path=artifacts_path, artifacts_path=artifacts_path,
@ -108,13 +109,16 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
) )
@staticmethod @staticmethod
def download_models_hf( def download_models(
local_dir: Optional[Path] = None, force: bool = False local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars from huggingface_hub.utils import disable_progress_bars
disable_progress_bars() if not progress:
disable_progress_bars()
download_path = snapshot_download( download_path = snapshot_download(
repo_id="ds4sd/CodeFormula", repo_id="ds4sd/CodeFormula",
force_download=force, force_download=force,

View File

@ -55,12 +55,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
Processes a batch of elements and adds classification annotations. Processes a batch of elements and adds classification annotations.
""" """
_model_repo_folder = "DocumentFigureClassifier"
images_scale = 2 images_scale = 2
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
artifacts_path: Optional[Union[Path, str]], artifacts_path: Optional[Path],
options: DocumentPictureClassifierOptions, options: DocumentPictureClassifierOptions,
accelerator_options: AcceleratorOptions, accelerator_options: AcceleratorOptions,
): ):
@ -88,9 +89,9 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
) )
if artifacts_path is None: if artifacts_path is None:
artifacts_path = self.download_models_hf() artifacts_path = self.download_models()
else: else:
artifacts_path = Path(artifacts_path) artifacts_path = artifacts_path / self._model_repo_folder
self.document_picture_classifier = DocumentFigureClassifierPredictor( self.document_picture_classifier = DocumentFigureClassifierPredictor(
artifacts_path=artifacts_path, artifacts_path=artifacts_path,
@ -99,13 +100,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
) )
@staticmethod @staticmethod
def download_models_hf( def download_models(
local_dir: Optional[Path] = None, force: bool = False local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars from huggingface_hub.utils import disable_progress_bars
disable_progress_bars() if not progress:
disable_progress_bars()
download_path = snapshot_download( download_path = snapshot_download(
repo_id="ds4sd/DocumentFigureClassifier", repo_id="ds4sd/DocumentFigureClassifier",
force_download=force, force_download=force,

View File

@ -1,7 +1,10 @@
import logging import logging
import warnings import warnings
from typing import Iterable import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
import httpx
import numpy import numpy
import torch import torch
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -17,14 +20,18 @@ from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
from docling.utils.utils import download_url_with_progress
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class EasyOcrModel(BaseOcrModel): class EasyOcrModel(BaseOcrModel):
_model_repo_folder = "EasyOcr"
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
artifacts_path: Optional[Path],
options: EasyOcrOptions, options: EasyOcrOptions,
accelerator_options: AcceleratorOptions, accelerator_options: AcceleratorOptions,
): ):
@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel):
) )
use_gpu = self.options.use_gpu use_gpu = self.options.use_gpu
download_enabled = self.options.download_enabled
model_storage_directory = self.options.model_storage_directory
if artifacts_path is not None and model_storage_directory is None:
download_enabled = False
model_storage_directory = str(artifacts_path / self._model_repo_folder)
self.reader = easyocr.Reader( self.reader = easyocr.Reader(
lang_list=self.options.lang, lang_list=self.options.lang,
gpu=use_gpu, gpu=use_gpu,
model_storage_directory=self.options.model_storage_directory, model_storage_directory=model_storage_directory,
recog_network=self.options.recog_network, recog_network=self.options.recog_network,
download_enabled=self.options.download_enabled, download_enabled=download_enabled,
verbose=False, verbose=False,
) )
@staticmethod
def download_models(
    detection_models: Optional[List[str]] = None,
    recognition_models: Optional[List[str]] = None,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download and unpack EasyOCR model archives into a local directory.

    Args:
        detection_models: Names of detection models to fetch, looked up in
            ``easyocr.config.detection_models``. Defaults to ``["craft"]``.
        recognition_models: Names of recognition models to fetch, looked up
            in the ``"gen2"`` section of ``easyocr.config.recognition_models``.
            Defaults to ``["english_g2", "latin_g2"]``.
        local_dir: Destination directory. Defaults to
            ``settings.cache_dir / "models" / EasyOcrModel._model_repo_folder``.
        force: Not consulted by this implementation; every call downloads
            and extracts the selected archives again.
        progress: Forwarded to ``download_url_with_progress`` to toggle the
            progress bar.

    Returns:
        The directory into which the archives were extracted.
    """
    # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
    from easyocr.config import detection_models as det_models_dict
    from easyocr.config import recognition_models as rec_models_dict

    # Resolve the defaults here instead of using mutable default arguments.
    if detection_models is None:
        detection_models = ["craft"]
    if recognition_models is None:
        recognition_models = ["english_g2", "latin_g2"]

    if local_dir is None:
        local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Collect models to download; unknown names are reported rather than
    # dropped silently, so a typo does not go unnoticed.
    download_list = []
    for model_name in detection_models:
        if model_name in det_models_dict:
            download_list.append(det_models_dict[model_name])
        else:
            _log.warning("Unknown EasyOCR detection model skipped: %s", model_name)

    gen2_models = rec_models_dict["gen2"]
    for model_name in recognition_models:
        if model_name in gen2_models:
            download_list.append(gen2_models[model_name])
        else:
            _log.warning("Unknown EasyOCR recognition model skipped: %s", model_name)

    # Download models: each URL is fetched into memory and unpacked as a zip
    # archive into the destination directory.
    for model_details in download_list:
        buf = download_url_with_progress(model_details["url"], progress=progress)
        with zipfile.ZipFile(buf, "r") as zip_ref:
            zip_ref.extractall(local_dir)

    return local_dir
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:

View File

@ -1,7 +1,8 @@
import copy import copy
import logging import logging
import warnings
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable, Optional, Union
from docling_core.types.doc import DocItemLabel from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@ -21,6 +22,8 @@ _log = logging.getLogger(__name__)
class LayoutModel(BasePageModel): class LayoutModel(BasePageModel):
_model_repo_folder = "docling-models"
_model_path = "model_artifacts/layout"
TEXT_ELEM_LABELS = [ TEXT_ELEM_LABELS = [
DocItemLabel.TEXT, DocItemLabel.TEXT,
@ -42,15 +45,56 @@ class LayoutModel(BasePageModel):
FORMULA_LABEL = DocItemLabel.FORMULA FORMULA_LABEL = DocItemLabel.FORMULA
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions): def __init__(
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
):
device = decide_device(accelerator_options.device) device = decide_device(accelerator_options.device)
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
# will become the default in the future
if (artifacts_path / self._model_repo_folder).exists():
artifacts_path = (
artifacts_path / self._model_repo_folder / self._model_path
)
elif (artifacts_path / self._model_path).exists():
warnings.warn(
"The usage of artifacts_path containing directly "
f"{self._model_path} is deprecated. Please point "
"the artifacts_path to the parent containing "
f"the {self._model_repo_folder} folder.",
DeprecationWarning,
stacklevel=3,
)
artifacts_path = artifacts_path / self._model_path
self.layout_predictor = LayoutPredictor( self.layout_predictor = LayoutPredictor(
artifact_path=str(artifacts_path), artifact_path=str(artifacts_path),
device=device, device=device,
num_threads=accelerator_options.num_threads, num_threads=accelerator_options.num_threads,
) )
@staticmethod
def download_models(
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Fetch revision ``v2.1.0`` of the ``ds4sd/docling-models`` repository.

    Args:
        local_dir: Passed to ``snapshot_download`` as ``local_dir``.
        force: Passed to ``snapshot_download`` as ``force_download``.
        progress: When false, Hugging Face progress bars are disabled
            before the download starts.

    Returns:
        The path reported by ``snapshot_download``, as a ``Path``.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars

    silent = not progress
    if silent:
        disable_progress_bars()

    return Path(
        snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
    )
def draw_clusters_and_cells_side_by_side( def draw_clusters_and_cells_side_by_side(
self, conv_res, page, clusters, mode_prefix: str, show: bool = False self, conv_res, page, clusters, mode_prefix: str, show: bool = False
): ):

View File

@ -1,6 +1,7 @@
import copy import copy
import warnings
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable, Optional, Union
import numpy import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel): class TableStructureModel(BasePageModel):
_model_repo_folder = "docling-models"
_model_path = "model_artifacts/tableformer"
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
artifacts_path: Path, artifacts_path: Optional[Path],
options: TableStructureOptions, options: TableStructureOptions,
accelerator_options: AcceleratorOptions, accelerator_options: AcceleratorOptions,
): ):
@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled self.enabled = enabled
if self.enabled: if self.enabled:
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
# will become the default in the future
if (artifacts_path / self._model_repo_folder).exists():
artifacts_path = (
artifacts_path / self._model_repo_folder / self._model_path
)
elif (artifacts_path / self._model_path).exists():
warnings.warn(
"The usage of artifacts_path containing directly "
f"{self._model_path} is deprecated. Please point "
"the artifacts_path to the parent containing "
f"the {self._model_repo_folder} folder.",
DeprecationWarning,
stacklevel=3,
)
artifacts_path = artifacts_path / self._model_path
if self.mode == TableFormerMode.ACCURATE: if self.mode == TableFormerMode.ACCURATE:
artifacts_path = artifacts_path / "accurate" artifacts_path = artifacts_path / "accurate"
else: else:
@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel):
) )
self.scale = 2.0 # Scale up table input images to 144 dpi self.scale = 2.0 # Scale up table input images to 144 dpi
@staticmethod
def download_models(
    local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
    """Fetch revision ``v2.1.0`` of the ``ds4sd/docling-models`` repository.

    Args:
        local_dir: Passed to ``snapshot_download`` as ``local_dir``.
        force: Passed to ``snapshot_download`` as ``force_download``.
        progress: When false, Hugging Face progress bars are disabled
            before the download starts.

    Returns:
        The path reported by ``snapshot_download``, as a ``Path``.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import disable_progress_bars

    silent = not progress
    if silent:
        disable_progress_bars()

    return Path(
        snapshot_download(
            repo_id="ds4sd/docling-models",
            force_download=force,
            local_dir=local_dir,
            revision="v2.1.0",
        )
    )
def draw_table_and_cells( def draw_table_and_cells(
self, self,
conv_res: ConversionResult, conv_res: ConversionResult,

View File

@ -1,5 +1,6 @@
import logging import logging
import sys import sys
import warnings
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions, TesseractOcrOptions,
) )
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import ( from docling.models.document_picture_classifier import (
@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.model_downloader import download_models
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class StandardPdfPipeline(PaginatedPipeline): class StandardPdfPipeline(PaginatedPipeline):
_layout_model_path = "model_artifacts/layout" _layout_model_path = LayoutModel._model_path
_table_model_path = "model_artifacts/tableformer" _table_model_path = TableStructureModel._model_path
def __init__(self, pipeline_options: PdfPipelineOptions): def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions self.pipeline_options: PdfPipelineOptions
if pipeline_options.artifacts_path is None: artifacts_path: Optional[Path] = None
self.artifacts_path = self.download_models_hf() if pipeline_options.artifacts_path is not None:
else: artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
self.artifacts_path = Path(pipeline_options.artifacts_path)
self.keep_images = ( self.keep_images = (
self.pipeline_options.generate_page_images self.pipeline_options.generate_page_images
@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline):
self.glm_model = GlmModel(options=GlmOptions()) self.glm_model = GlmModel(options=GlmOptions())
if (ocr_model := self.get_ocr_model()) is None: if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
raise RuntimeError( raise RuntimeError(
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
) )
@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline):
ocr_model, ocr_model,
# Layout model # Layout model
LayoutModel( LayoutModel(
artifacts_path=self.artifacts_path artifacts_path=artifacts_path,
/ StandardPdfPipeline._layout_model_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
), ),
# Table structure model # Table structure model
TableStructureModel( TableStructureModel(
enabled=pipeline_options.do_table_structure, enabled=pipeline_options.do_table_structure,
artifacts_path=self.artifacts_path artifacts_path=artifacts_path,
/ StandardPdfPipeline._table_model_path,
options=pipeline_options.table_structure_options, options=pipeline_options.table_structure_options,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
), ),
@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline):
CodeFormulaModel( CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment enabled=pipeline_options.do_code_enrichment
or pipeline_options.do_formula_enrichment, or pipeline_options.do_formula_enrichment,
artifacts_path=pipeline_options.artifacts_path, artifacts_path=artifacts_path,
options=CodeFormulaModelOptions( options=CodeFormulaModelOptions(
do_code_enrichment=pipeline_options.do_code_enrichment, do_code_enrichment=pipeline_options.do_code_enrichment,
do_formula_enrichment=pipeline_options.do_formula_enrichment, do_formula_enrichment=pipeline_options.do_formula_enrichment,
@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline):
# Document Picture Classifier # Document Picture Classifier
DocumentPictureClassifier( DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification, enabled=pipeline_options.do_picture_classification,
artifacts_path=pipeline_options.artifacts_path, artifacts_path=artifacts_path,
options=DocumentPictureClassifierOptions(), options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
), ),
@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline):
def download_models_hf( def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False local_dir: Optional[Path] = None, force: bool = False
) -> Path: ) -> Path:
from huggingface_hub import snapshot_download warnings.warn(
from huggingface_hub.utils import disable_progress_bars "The usage of StandardPdfPipeline.download_models_hf() is deprecated "
"use instead the utility `docling-tools models download`, or "
disable_progress_bars() "the upstream method docling.utils.models_downloader.download_all()",
download_path = snapshot_download( DeprecationWarning,
repo_id="ds4sd/docling-models", stacklevel=3,
force_download=force,
local_dir=local_dir,
revision="v2.1.0",
) )
return Path(download_path) output_dir = download_models(output_dir=local_dir, force=force, progress=False)
return output_dir
def get_ocr_model(self) -> Optional[BaseOcrModel]: def get_ocr_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[BaseOcrModel]:
if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions): if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
return EasyOcrModel( return EasyOcrModel(
enabled=self.pipeline_options.do_ocr, enabled=self.pipeline_options.do_ocr,
artifacts_path=artifacts_path,
options=self.pipeline_options.ocr_options, options=self.pipeline_options.ocr_options,
accelerator_options=self.pipeline_options.accelerator_options, accelerator_options=self.pipeline_options.accelerator_options,
) )

View File

@ -0,0 +1,72 @@
import logging
from pathlib import Path
from typing import Optional
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
_log = logging.getLogger(__name__)
def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
    with_layout: bool = True,
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_easyocr: bool = True,
) -> Path:
    """Download the selected model artifacts below a common directory.

    Each selected model class is asked to download into
    ``output_dir / <ModelClass>._model_repo_folder``.

    Args:
        output_dir: Root directory for the artifacts. Defaults to
            ``settings.cache_dir / "models"``; created if it does not exist.
        force: Forwarded to every ``download_models`` call.
        progress: Forwarded to every ``download_models`` call.
        with_layout: Download the layout model.
        with_tableformer: Download the tableformer model.
        with_code_formula: Download the code formula model.
        with_picture_classifier: Download the picture classifier model.
        with_easyocr: Download the easyocr models.

    Returns:
        The root directory that the artifacts were downloaded into.
    """
    if output_dir is None:
        output_dir = settings.cache_dir / "models"
    else:
        # Tolerate callers that hand over a plain string path.
        output_dir = Path(output_dir)

    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    # (selected?, label used in the log message, model class), in download order.
    # NOTE(review): the layout and tableformer entries appear to resolve to the
    # same repository folder, so that repository is requested twice - confirm
    # whether the second request can be skipped.
    candidates = (
        (with_layout, "layout model", LayoutModel),
        (with_tableformer, "tableformer model", TableStructureModel),
        (with_picture_classifier, "picture classifier model", DocumentPictureClassifier),
        (with_code_formula, "code formula model", CodeFormulaModel),
        (with_easyocr, "easyocr models", EasyOcrModel),
    )

    for selected, label, model_cls in candidates:
        if not selected:
            continue
        _log.info("Downloading %s...", label)
        model_cls.download_models(
            local_dir=output_dir / model_cls._model_repo_folder,
            force=force,
            progress=progress,
        )

    return output_dir

View File

@ -4,6 +4,9 @@ from itertools import islice
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import List, Union
import requests
from tqdm import tqdm
def chunkify(iterator, chunk_size): def chunkify(iterator, chunk_size):
"""Yield successive chunks of chunk_size from the iterable.""" """Yield successive chunks of chunk_size from the iterable."""
@ -39,3 +42,24 @@ def create_hash(string: str):
hasher.update(string.encode("utf-8")) hasher.update(string.encode("utf-8"))
return hasher.hexdigest() return hasher.hexdigest()
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Download ``url`` into an in-memory buffer, optionally showing progress.

    Args:
        url: Location of the resource to fetch; redirects are followed.
        progress: When true, a tqdm progress bar is displayed.

    Returns:
        A ``BytesIO`` holding the response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail early on 4xx/5xx instead of returning the error page as payload.
        response.raise_for_status()

        total_size = int(response.headers.get("content-length", 0))
        # The context manager closes the bar even if the transfer fails midway.
        with tqdm(
            # Without a Content-Length header the total is unknown.
            total=total_size or None,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    # Rewind so that callers can read the buffer from the start.
    buf.seek(0)
    return buf

View File

@ -26,12 +26,56 @@ To see all available options (export formats etc.) run `docling --help`. More de
### Advanced options ### Advanced options
#### Model prefetching and offline usage
By default, models are downloaded automatically upon first usage. If you would prefer
to explicitly prefetch them for offline use (e.g. in air-gapped environments) you can do
that as follows:
**Step 1: Prefetch the models**
Use the `docling-tools models download` utility:
```sh
$ docling-tools models download
Downloading layout model...
Downloading tableformer model...
Downloading picture classifier model...
Downloading code formula model...
Downloading easyocr models...
Models downloaded into: $HOME/.cache/docling/models.
```
Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()`.
**Step 2: Use the prefetched models**
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
artifacts_path = "/local/path/to/models"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
Or using the CLI:
```sh
docling --artifacts-path="/local/path/to/models" FILE
```
#### Adjust pipeline features #### Adjust pipeline features
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
one can adjust the conversion pipeline and features. one can adjust the conversion pipeline and features.
##### Control PDF table extraction options ##### Control PDF table extraction options
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
@ -70,28 +114,6 @@ doc_converter = DocumentConverter(
) )
``` ```
##### Provide specific artifacts path
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# # to explicitly prefetch:
# artifacts_path = StandardPdfPipeline.download_models_hf()
artifacts_path = "/local/path/to/artifacts"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
#### Impose limits on the document size #### Impose limits on the document size

16
poetry.lock generated
View File

@ -7248,6 +7248,20 @@ files = [
[package.dependencies] [package.dependencies]
urllib3 = ">=2" urllib3 = ">=2"
[[package]]
name = "types-tqdm"
version = "4.67.0.20241221"
description = "Typing stubs for tqdm"
optional = false
python-versions = ">=3.8"
files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
]
[package.dependencies]
types-requests = "*"
[[package]] [[package]]
name = "typing-extensions" name = "typing-extensions"
version = "4.12.2" version = "4.12.2"
@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632" content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7"

View File

@ -60,6 +60,7 @@ onnxruntime = [
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
] ]
pillow = "^10.0.0" pillow = "^10.0.0"
tqdm = "^4.65.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"} black = {extras = ["jupyter"], version = "^24.4.2"}
@ -79,6 +80,7 @@ ipykernel = "^6.29.5"
ipywidgets = "^8.1.5" ipywidgets = "^8.1.5"
nbqa = "^1.9.0" nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114" types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
[tool.poetry.group.docs.dependencies] [tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40" mkdocs-material = "^9.5.40"
@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts] [tool.poetry.scripts]
docling = "docling.cli.main:app" docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]