From ed74fe2ec0a702834f0deacfdb5717c8c587dab1 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:46:32 +0100 Subject: [PATCH] feat: new artifacts path and CLI utility (#876) * fix artifacts path Signed-off-by: Michele Dolfi * add docling-models utility Signed-off-by: Michele Dolfi * missing formatting Signed-off-by: Michele Dolfi * rename utility to docling-tools Signed-off-by: Michele Dolfi * rename download methods and deprecation warnings Signed-off-by: Michele Dolfi * propagate artifacts path usage for ocr models Signed-off-by: Michele Dolfi * move function to utils Signed-off-by: Michele Dolfi * remove unused file Signed-off-by: Michele Dolfi * update docs Signed-off-by: Michele Dolfi * simplify downloading specific model(s) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * minor refactor Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .github/workflows/checks.yml | 2 +- docling/cli/models.py | 105 ++++++++++++++++++ docling/cli/tools.py | 17 +++ docling/datamodel/settings.py | 2 + docling/models/code_formula_model.py | 16 ++- docling/models/document_picture_classifier.py | 14 ++- docling/models/easyocr_model.py | 53 ++++++++- docling/models/layout_model.py | 48 +++++++- docling/models/table_structure_model.py | 46 +++++++- docling/pipeline/standard_pdf_pipeline.py | 49 ++++---- docling/utils/model_downloader.py | 72 ++++++++++++ docling/utils/utils.py | 24 ++++ docs/usage.md | 68 ++++++++---- poetry.lock | 16 ++- pyproject.toml | 3 + 15 files changed, 467 insertions(+), 68 deletions(-) create mode 100644 docling/cli/models.py create mode 100644 docling/cli/tools.py create mode 100644 docling/utils/model_downloader.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 19e8c1e..75ea597 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -28,7 +28,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then echo "Skipping $file" continue fi diff --git a/docling/cli/models.py b/docling/cli/models.py new file mode 100644 index 0000000..aea498c --- /dev/null +++ b/docling/cli/models.py @@ -0,0 +1,105 @@ +import logging +import warnings +from enum import Enum +from pathlib import Path +from typing import Annotated, Optional + +import typer +from rich.console import Console +from rich.logging import RichHandler + +from docling.datamodel.settings import settings +from docling.utils.model_downloader import download_models + +warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") +warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") + +console = Console() +err_console = Console(stderr=True) + + +app = typer.Typer( + name="Docling models helper", + no_args_is_help=True, + add_completion=False, + pretty_exceptions_enable=False, +) + + +class _AvailableModels(str, Enum): + LAYOUT = "layout" + TABLEFORMER = "tableformer" + CODE_FORMULA = "code_formula" + PICTURE_CLASSIFIER = "picture_classifier" + EASYOCR = "easyocr" + + +@app.command("download") +def download( + output_dir: Annotated[ + Path, + typer.Option( + ..., + "-o", + "--output-dir", + help="The directory where all the models are downloaded.", + ), + ] = (settings.cache_dir / "models"), + force: Annotated[ + bool, typer.Option(..., help="If true, the download will be forced") + ] = False, + models: Annotated[ + Optional[list[_AvailableModels]], + typer.Argument( + help=f"Models to download (default behavior: all will be downloaded)", + ), + ] = None, + quiet: Annotated[ + bool, + typer.Option( + ..., + "-q", + "--quiet", + help="No extra output is generated, the CLI prints only the directory with the cached models.", + ), + ] = False, +): + if not quiet: + FORMAT = "%(message)s" + logging.basicConfig( + level=logging.INFO, + format="[blue]%(message)s[/blue]", + datefmt="[%X]", + handlers=[RichHandler(show_level=False, show_time=False, markup=True)], + ) + to_download = models or [m for m in _AvailableModels] + output_dir = download_models( + output_dir=output_dir, + force=force, + progress=(not quiet), + with_layout=_AvailableModels.LAYOUT in to_download, + with_tableformer=_AvailableModels.TABLEFORMER in to_download, + with_code_formula=_AvailableModels.CODE_FORMULA in to_download, + with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download, + with_easyocr=_AvailableModels.EASYOCR in to_download, + ) + + if quiet: + typer.echo(output_dir) + else: + typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green") + + console.print( + "\n", + "Docling can now be configured for running offline using the local artifacts.\n\n", + "Using the CLI:", + f"`docling --artifacts-path={output_dir} FILE`", + "\n", + "Using Python: see the documentation at .", + ) + + +click_app = typer.main.get_command(app) + +if __name__ == "__main__": + app() diff --git a/docling/cli/tools.py b/docling/cli/tools.py new file mode 100644 index 0000000..8711013 --- /dev/null +++ b/docling/cli/tools.py @@ -0,0 +1,17 @@ +import typer + +from docling.cli.models import app as models_app + +app = typer.Typer( + name="Docling helpers", + no_args_is_help=True, + add_completion=False, + pretty_exceptions_enable=False, +) + +app.add_typer(models_app, name="models") + +click_app = typer.main.get_command(app) + +if __name__ == "__main__": + app() diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 9285620..439ffe7 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -61,5 +61,7 @@ class AppSettings(BaseSettings): perf: BatchConcurrencySettings debug: DebugSettings + cache_dir: Path = Path.home() / ".cache" / "docling" + settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings()) diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py index 6648f46..f2c54ed 100644 --- a/docling/models/code_formula_model.py +++ b/docling/models/code_formula_model.py @@ -61,6 +61,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): Processes the given batch of elements and enriches them with predictions. """ + _model_repo_folder = "CodeFormula" elements_batch_size = 5 images_scale = 1.66 # = 120 dpi, aligned with training data resolution expansion_factor = 0.03 @@ -68,7 +69,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): def __init__( self, enabled: bool, - artifacts_path: Optional[Union[Path, str]], + artifacts_path: Optional[Path], options: CodeFormulaModelOptions, accelerator_options: AcceleratorOptions, ): @@ -97,9 +98,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): ) if artifacts_path is None: - artifacts_path = self.download_models_hf() + artifacts_path = self.download_models() else: - artifacts_path = Path(artifacts_path) + artifacts_path = artifacts_path / self._model_repo_folder self.code_formula_model = CodeFormulaPredictor( artifacts_path=artifacts_path, @@ -108,13 +109,16 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel): ) @staticmethod - def download_models_hf( - local_dir: Optional[Path] = None, force: bool = False + def download_models( + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, ) -> Path: from huggingface_hub import snapshot_download from huggingface_hub.utils import disable_progress_bars - disable_progress_bars() + if not progress: + disable_progress_bars() download_path = snapshot_download( repo_id="ds4sd/CodeFormula", force_download=force, diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 6e2d90b..5e9b399 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -55,12 +55,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel): Processes a batch of elements and adds classification annotations. """ + _model_repo_folder = "DocumentFigureClassifier" images_scale = 2 def __init__( self, enabled: bool, - artifacts_path: Optional[Union[Path, str]], + artifacts_path: Optional[Path], options: DocumentPictureClassifierOptions, accelerator_options: AcceleratorOptions, ): @@ -88,9 +89,9 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ) if artifacts_path is None: - artifacts_path = self.download_models_hf() + artifacts_path = self.download_models() else: - artifacts_path = Path(artifacts_path) + artifacts_path = artifacts_path / self._model_repo_folder self.document_picture_classifier = DocumentFigureClassifierPredictor( artifacts_path=artifacts_path, @@ -99,13 +100,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ) @staticmethod - def download_models_hf( - local_dir: Optional[Path] = None, force: bool = False + def download_models( + local_dir: Optional[Path] = None, force: bool = False, progress: bool = False ) -> Path: from huggingface_hub import snapshot_download from huggingface_hub.utils import disable_progress_bars - disable_progress_bars() + if not progress: + disable_progress_bars() download_path = snapshot_download( repo_id="ds4sd/DocumentFigureClassifier", force_download=force, diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index bbe4fb0..9b1b2a0 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -1,7 +1,10 @@ import logging import warnings -from typing import Iterable +import zipfile +from pathlib import Path +from typing import Iterable, List, Optional +import httpx import numpy import torch from docling_core.types.doc import BoundingBox, CoordOrigin @@ -17,14 +20,18 @@ from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import TimeRecorder +from docling.utils.utils import download_url_with_progress _log = logging.getLogger(__name__) class EasyOcrModel(BaseOcrModel): + _model_repo_folder = "EasyOcr" + def __init__( self, enabled: bool, + artifacts_path: Optional[Path], options: EasyOcrOptions, accelerator_options: AcceleratorOptions, ): @@ -62,15 +69,55 @@ class EasyOcrModel(BaseOcrModel): ) use_gpu = self.options.use_gpu + download_enabled = self.options.download_enabled + model_storage_directory = self.options.model_storage_directory + if artifacts_path is not None and model_storage_directory is None: + download_enabled = False + model_storage_directory = str(artifacts_path / self._model_repo_folder) + self.reader = easyocr.Reader( lang_list=self.options.lang, gpu=use_gpu, - model_storage_directory=self.options.model_storage_directory, + model_storage_directory=model_storage_directory, recog_network=self.options.recog_network, - download_enabled=self.options.download_enabled, + download_enabled=download_enabled, verbose=False, ) + @staticmethod + def download_models( + detection_models: List[str] = ["craft"], + recognition_models: List[str] = ["english_g2", "latin_g2"], + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py + from easyocr.config import detection_models as det_models_dict + from easyocr.config import recognition_models as rec_models_dict + + if local_dir is None: + local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder + + local_dir.mkdir(parents=True, exist_ok=True) + + # Collect models to download + download_list = [] + for model_name in detection_models: + if model_name in det_models_dict: + download_list.append(det_models_dict[model_name]) + for model_name in recognition_models: + if model_name in rec_models_dict["gen2"]: + download_list.append(rec_models_dict["gen2"][model_name]) + + # Download models + for model_details in download_list: + buf = download_url_with_progress(model_details["url"], progress=progress) + with zipfile.ZipFile(buf, "r") as zip_ref: + zip_ref.extractall(local_dir) + + return local_dir + def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 69193c9..56890c5 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -1,7 +1,8 @@ import copy import logging +import warnings from pathlib import Path -from typing import Iterable +from typing import Iterable, Optional, Union from docling_core.types.doc import DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor @@ -21,6 +22,8 @@ _log = logging.getLogger(__name__) class LayoutModel(BasePageModel): + _model_repo_folder = "docling-models" + _model_path = "model_artifacts/layout" TEXT_ELEM_LABELS = [ DocItemLabel.TEXT, @@ -42,15 +45,56 @@ class LayoutModel(BasePageModel): FORMULA_LABEL = DocItemLabel.FORMULA CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] - def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions): + def __init__( + self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions + ): device = decide_device(accelerator_options.device) + if artifacts_path is None: + artifacts_path = self.download_models() / self._model_path + else: + # will become the default in the future + if (artifacts_path / self._model_repo_folder).exists(): + artifacts_path = ( + artifacts_path / self._model_repo_folder / self._model_path + ) + elif (artifacts_path / self._model_path).exists(): + warnings.warn( + "The usage of artifacts_path containing directly " + f"{self._model_path} is deprecated. Please point " + "the artifacts_path to the parent containing " + f"the {self._model_repo_folder} folder.", + DeprecationWarning, + stacklevel=3, + ) + artifacts_path = artifacts_path / self._model_path + self.layout_predictor = LayoutPredictor( artifact_path=str(artifacts_path), device=device, num_threads=accelerator_options.num_threads, ) + @staticmethod + def download_models( + local_dir: Optional[Path] = None, + force: bool = False, + progress: bool = False, + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + repo_id="ds4sd/docling-models", + force_download=force, + local_dir=local_dir, + revision="v2.1.0", + ) + + return Path(download_path) + def draw_clusters_and_cells_side_by_side( self, conv_res, page, clusters, mode_prefix: str, show: bool = False ): diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index f17cbed..b5ab5a2 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,6 +1,7 @@ import copy +import warnings from pathlib import Path -from typing import Iterable +from typing import Iterable, Optional, Union import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell @@ -22,10 +23,13 @@ from docling.utils.profiling import TimeRecorder class TableStructureModel(BasePageModel): + _model_repo_folder = "docling-models" + _model_path = "model_artifacts/tableformer" + def __init__( self, enabled: bool, - artifacts_path: Path, + artifacts_path: Optional[Path], options: TableStructureOptions, accelerator_options: AcceleratorOptions, ): @@ -35,6 +39,26 @@ class TableStructureModel(BasePageModel): self.enabled = enabled if self.enabled: + + if artifacts_path is None: + artifacts_path = self.download_models() / self._model_path + else: + # will become the default in the future + if (artifacts_path / self._model_repo_folder).exists(): + artifacts_path = ( + artifacts_path / self._model_repo_folder / self._model_path + ) + elif (artifacts_path / self._model_path).exists(): + warnings.warn( + "The usage of artifacts_path containing directly " + f"{self._model_path} is deprecated. Please point " + "the artifacts_path to the parent containing " + f"the {self._model_repo_folder} folder.", + DeprecationWarning, + stacklevel=3, + ) + artifacts_path = artifacts_path / self._model_path + if self.mode == TableFormerMode.ACCURATE: artifacts_path = artifacts_path / "accurate" else: @@ -58,6 +82,24 @@ class TableStructureModel(BasePageModel): ) self.scale = 2.0 # Scale up table input images to 144 dpi + @staticmethod + def download_models( + local_dir: Optional[Path] = None, force: bool = False, progress: bool = False + ) -> Path: + from huggingface_hub import snapshot_download + from huggingface_hub.utils import disable_progress_bars + + if not progress: + disable_progress_bars() + download_path = snapshot_download( + repo_id="ds4sd/docling-models", + force_download=force, + local_dir=local_dir, + revision="v2.1.0", + ) + + return Path(download_path) + def draw_table_and_cells( self, conv_res: ConversionResult, diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index fe2201d..4e66415 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -1,5 +1,6 @@ import logging import sys +import warnings from pathlib import Path from typing import Optional @@ -17,6 +18,7 @@ from docling.datamodel.pipeline_options import ( TesseractCliOcrOptions, TesseractOcrOptions, ) +from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions from docling.models.document_picture_classifier import ( @@ -37,23 +39,23 @@ from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_pipeline import PaginatedPipeline +from docling.utils.model_downloader import download_models from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) class StandardPdfPipeline(PaginatedPipeline): - _layout_model_path = "model_artifacts/layout" - _table_model_path = "model_artifacts/tableformer" + _layout_model_path = LayoutModel._model_path + _table_model_path = TableStructureModel._model_path def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) self.pipeline_options: PdfPipelineOptions - if pipeline_options.artifacts_path is None: - self.artifacts_path = self.download_models_hf() - else: - self.artifacts_path = Path(pipeline_options.artifacts_path) + artifacts_path: Optional[Path] = None + if pipeline_options.artifacts_path is not None: + artifacts_path = Path(pipeline_options.artifacts_path).expanduser() self.keep_images = ( self.pipeline_options.generate_page_images @@ -63,7 +65,7 @@ class StandardPdfPipeline(PaginatedPipeline): self.glm_model = GlmModel(options=GlmOptions()) - if (ocr_model := self.get_ocr_model()) is None: + if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None: raise RuntimeError( f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." ) @@ -79,15 +81,13 @@ class StandardPdfPipeline(PaginatedPipeline): ocr_model, # Layout model LayoutModel( - artifacts_path=self.artifacts_path - / StandardPdfPipeline._layout_model_path, + artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, ), # Table structure model TableStructureModel( enabled=pipeline_options.do_table_structure, - artifacts_path=self.artifacts_path - / StandardPdfPipeline._table_model_path, + artifacts_path=artifacts_path, options=pipeline_options.table_structure_options, accelerator_options=pipeline_options.accelerator_options, ), @@ -101,7 +101,7 @@ class StandardPdfPipeline(PaginatedPipeline): CodeFormulaModel( enabled=pipeline_options.do_code_enrichment or pipeline_options.do_formula_enrichment, - artifacts_path=pipeline_options.artifacts_path, + artifacts_path=artifacts_path, options=CodeFormulaModelOptions( do_code_enrichment=pipeline_options.do_code_enrichment, do_formula_enrichment=pipeline_options.do_formula_enrichment, @@ -111,7 +111,7 @@ class StandardPdfPipeline(PaginatedPipeline): # Document Picture Classifier DocumentPictureClassifier( enabled=pipeline_options.do_picture_classification, - artifacts_path=pipeline_options.artifacts_path, + artifacts_path=artifacts_path, options=DocumentPictureClassifierOptions(), accelerator_options=pipeline_options.accelerator_options, ), @@ -127,23 +127,24 @@ class StandardPdfPipeline(PaginatedPipeline): def download_models_hf( local_dir: Optional[Path] = None, force: bool = False ) -> Path: - from huggingface_hub import snapshot_download - from huggingface_hub.utils import disable_progress_bars - - disable_progress_bars() - download_path = snapshot_download( - repo_id="ds4sd/docling-models", - force_download=force, - local_dir=local_dir, - revision="v2.1.0", + warnings.warn( + "The usage of StandardPdfPipeline.download_models_hf() is deprecated " + "use instead the utility `docling-tools models download`, or " + "the upstream method docling.utils.models_downloader.download_all()", + DeprecationWarning, + stacklevel=3, ) - return Path(download_path) + output_dir = download_models(output_dir=local_dir, force=force, progress=False) + return output_dir - def get_ocr_model(self) -> Optional[BaseOcrModel]: + def get_ocr_model( + self, artifacts_path: Optional[Path] = None + ) -> Optional[BaseOcrModel]: if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions): return EasyOcrModel( enabled=self.pipeline_options.do_ocr, + artifacts_path=artifacts_path, options=self.pipeline_options.ocr_options, accelerator_options=self.pipeline_options.accelerator_options, ) diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py new file mode 100644 index 0000000..504618e --- /dev/null +++ b/docling/utils/model_downloader.py @@ -0,0 +1,72 @@ +import logging +from pathlib import Path +from typing import Optional + +from docling.datamodel.settings import settings +from docling.models.code_formula_model import CodeFormulaModel +from docling.models.document_picture_classifier import DocumentPictureClassifier +from docling.models.easyocr_model import EasyOcrModel +from docling.models.layout_model import LayoutModel +from docling.models.table_structure_model import TableStructureModel + +_log = logging.getLogger(__name__) + + +def download_models( + output_dir: Optional[Path] = None, + *, + force: bool = False, + progress: bool = False, + with_layout: bool = True, + with_tableformer: bool = True, + with_code_formula: bool = True, + with_picture_classifier: bool = True, + with_easyocr: bool = True, +): + if output_dir is None: + output_dir = settings.cache_dir / "models" + + # Make sure the folder exists + output_dir.mkdir(exist_ok=True, parents=True) + + if with_layout: + _log.info(f"Downloading layout model...") + LayoutModel.download_models( + local_dir=output_dir / LayoutModel._model_repo_folder, + force=force, + progress=progress, + ) + + if with_tableformer: + _log.info(f"Downloading tableformer model...") + TableStructureModel.download_models( + local_dir=output_dir / TableStructureModel._model_repo_folder, + force=force, + progress=progress, + ) + + if with_picture_classifier: + _log.info(f"Downloading picture classifier model...") + DocumentPictureClassifier.download_models( + local_dir=output_dir / DocumentPictureClassifier._model_repo_folder, + force=force, + progress=progress, + ) + + if with_code_formula: + _log.info(f"Downloading code formula model...") + CodeFormulaModel.download_models( + local_dir=output_dir / CodeFormulaModel._model_repo_folder, + force=force, + progress=progress, + ) + + if with_easyocr: + _log.info(f"Downloading easyocr models...") + EasyOcrModel.download_models( + local_dir=output_dir / EasyOcrModel._model_repo_folder, + force=force, + progress=progress, + ) + + return output_dir diff --git a/docling/utils/utils.py b/docling/utils/utils.py index 24b6942..1261f86 100644 --- a/docling/utils/utils.py +++ b/docling/utils/utils.py @@ -4,6 +4,9 @@ from itertools import islice from pathlib import Path from typing import List, Union +import requests +from tqdm import tqdm + def chunkify(iterator, chunk_size): """Yield successive chunks of chunk_size from the iterable.""" @@ -39,3 +42,24 @@ def create_hash(string: str): hasher.update(string.encode("utf-8")) return hasher.hexdigest() + + +def download_url_with_progress(url: str, progress: bool = False) -> BytesIO: + buf = BytesIO() + with requests.get(url, stream=True, allow_redirects=True) as response: + total_size = int(response.headers.get("content-length", 0)) + progress_bar = tqdm( + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + disable=(not progress), + ) + + for chunk in response.iter_content(10 * 1024): + buf.write(chunk) + progress_bar.update(len(chunk)) + progress_bar.close() + + buf.seek(0) + return buf diff --git a/docs/usage.md b/docs/usage.md index a577a3e..a42bdea 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,12 +26,56 @@ To see all available options (export formats etc.) run `docling --help`. More de ### Advanced options +#### Model prefetching and offline usage + +By default, models are downloaded automatically upon first usage. If you would prefer +to explicitly prefetch them for offline use (e.g. in air-gapped environments) you can do +that as follows: + +**Step 1: Prefetch the models** + +Use the `docling-tools models download` utility: + +```sh +$ docling-tools models download +Downloading layout model... +Downloading tableformer model... +Downloading picture classifier model... +Downloading code formula model... +Downloading easyocr models... +Models downloaded into $HOME/.cache/docling/models. +``` + +Alternatively, models can be programmatically downloaded using `docling.utils.model_downloader.download_models()`. + +**Step 2: Use the prefetched models** + +```python +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption + +artifacts_path = "/local/path/to/models" + +pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } +) +``` + +Or using the CLI: + +```sh +docling --artifacts-path="/local/path/to/models" FILE +``` + #### Adjust pipeline features The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways one can adjust the conversion pipeline and features. - ##### Control PDF table extraction options You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. @@ -70,28 +114,6 @@ doc_converter = DocumentConverter( ) ``` -##### Provide specific artifacts path - -By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: - -```python -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - -# # to explicitly prefetch: -# artifacts_path = StandardPdfPipeline.download_models_hf() - -artifacts_path = "/local/path/to/artifacts" - -pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) -doc_converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) - } -) -``` #### Impose limits on the document size diff --git a/poetry.lock b/poetry.lock index 04e4e00..2ff4749 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7248,6 +7248,20 @@ files = [ [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-tqdm" +version = "4.67.0.20241221" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"}, + {file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"}, +] + +[package.dependencies] +types-requests = "*" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632" +content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7" diff --git a/pyproject.toml b/pyproject.toml index 4baf50a..934aff8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ onnxruntime = [ { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } ] pillow = "^10.0.0" +tqdm = "^4.65.0" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} @@ -79,6 +80,7 @@ ipykernel = "^6.29.5" ipywidgets = "^8.1.5" nbqa = "^1.9.0" types-openpyxl = "^3.1.5.20241114" +types-tqdm = "^4.67.0.20241221" [tool.poetry.group.docs.dependencies] mkdocs-material = "^9.5.40" @@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"] [tool.poetry.scripts] docling = "docling.cli.main:app" +docling-tools = "docling.cli.tools:app" [build-system] requires = ["poetry-core"]