diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 5e35a0e..67ada34 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -3,14 +3,13 @@ import logging from abc import abstractmethod from collections.abc import Iterable from pathlib import Path -from typing import List, Optional, Type +from typing import TYPE_CHECKING, List, Optional, Type import numpy as np from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import TextCell from PIL import Image, ImageDraw from rtree import index -from scipy.ndimage import binary_dilation, find_objects, label from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import Page @@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions): options: OcrOptions, accelerator_options: AcceleratorOptions, ): + # Make sure any delay/error from import occurs on ocr model init and not first use + from scipy.ndimage import binary_dilation, find_objects, label + self.enabled = enabled self.options = options # Computes the optimum amount and coordinates of rectangles to OCR on a given page def get_ocr_rects(self, page: Page) -> List[BoundingBox]: + from scipy.ndimage import binary_dilation, find_objects, label + BITMAP_COVERAGE_TRESHOLD = 0.75 assert page.size is not None diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 67bb3b3..da75bb8 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -7,7 +7,6 @@ from typing import Optional import numpy as np from docling_core.types.doc import DocItemLabel -from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import Image from docling.datamodel.accelerator_options import AcceleratorOptions @@ -51,6 +50,8 @@ class LayoutModel(BasePageModel): def __init__( self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions ): + from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor + device = decide_device(accelerator_options.device) if artifacts_path is None: diff --git a/docling/models/plugins/defaults.py b/docling/models/plugins/defaults.py index 0087357..8779847 100644 --- a/docling/models/plugins/defaults.py +++ b/docling/models/plugins/defaults.py @@ -1,13 +1,10 @@ -from docling.models.easyocr_model import EasyOcrModel -from docling.models.ocr_mac_model import OcrMacModel -from docling.models.picture_description_api_model import PictureDescriptionApiModel -from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel -from docling.models.rapid_ocr_model import RapidOcrModel -from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel -from docling.models.tesseract_ocr_model import TesseractOcrModel - - def ocr_engines(): + from docling.models.easyocr_model import EasyOcrModel + from docling.models.ocr_mac_model import OcrMacModel + from docling.models.rapid_ocr_model import RapidOcrModel + from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel + from docling.models.tesseract_ocr_model import TesseractOcrModel + return { "ocr_engines": [ EasyOcrModel, @@ -20,6 +17,9 @@ def ocr_engines(): def picture_description(): + from docling.models.picture_description_api_model import PictureDescriptionApiModel + from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel + return { "picture_description": [ PictureDescriptionVlmModel, diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index b90e85d..f5f2cb1 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -10,7 +10,6 @@ from docling_core.types.doc.page import ( BoundingRectangle, TextCellUnit, ) -from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions @@ -70,6 +69,9 @@ class TableStructureModel(BasePageModel): # Third Party import docling_ibm_models.tableformer.common as c + from docling_ibm_models.tableformer.data_management.tf_predictor import ( + TFPredictor, + ) device = decide_device(accelerator_options.device) diff --git a/docling/utils/accelerator_utils.py b/docling/utils/accelerator_utils.py index 09b6651..826b365 100644 --- a/docling/utils/accelerator_utils.py +++ b/docling/utils/accelerator_utils.py @@ -1,8 +1,6 @@ import logging from typing import List, Optional -import torch - from docling.datamodel.accelerator_options import AcceleratorDevice _log = logging.getLogger(__name__) @@ -18,6 +16,8 @@ def decide_device( 1. AUTO: Check for the best available device on the system. 2. User-defined: Check if the device actually exists, otherwise fall-back to CPU """ + import torch + device = "cpu" has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()