fix: Guard against attribute errors in TesseractOcrModel __del__ (#1494)

This moves the initialization of the `reader` and `script_readers` attributes to before the attempt to import tesserocr, so that the attributes already exist when they are later accessed in `__del__` during garbage collection. This requires changing the type annotation of the `script_readers` dict values to `Any`, because the actual type is a tesserocr type that cannot be referenced before that import. This prevents an exception from being thrown during garbage collection when a TesseractOcrModel instance was not fully initialized, for example when its initializer raises an `ImportError`.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-04-30 11:51:33 -04:00 · 2025-04-30 11:51:33 -04:00 · 4ab7e9ddfb
commit 4ab7e9ddfb
parent cc453961a9
1 changed file with 3 additions and 3 deletions
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -1,7 +1,7 @@
 import logging
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Type
+from typing import Any, Optional, Type

 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -38,6 +38,8 @@ class TesseractOcrModel(BaseOcrModel):
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+        self.script_readers: dict[str, Any] = {}

        if self.enabled:
            install_errmsg = (
@@ -84,9 +86,7 @@ class TesseractOcrModel(BaseOcrModel):
                "oem": tesserocr.OEM.DEFAULT,
            }

-            self.reader = None
            self.osd_reader = None
-            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

            if self.options.path is not None:
                tesserocr_kwargs["path"] = self.options.path