From 4ab7e9ddfb9d8fd0abc483efb70e701447a602c5 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Wed, 30 Apr 2025 11:51:33 -0400 Subject: [PATCH] fix: Guard against attribute errors in TesseractOcrModel __del__ (#1494) This moves the initialization of the `reader` and `script_readers` attributes to before the attempt to import tesserocr, so that the attributes already exist when the `__del__` finalizer later accesses them. This requires changing the type of the `script_readers` dict values to `Any`, because the concrete type (`tesserocr.PyTessBaseAPI`) cannot be referenced before tesserocr has been imported. This prevents an exception from being raised during garbage collection when a TesseractOcrModel instance was not fully initialized, for example when its initializer raises an `ImportError`. Signed-off-by: Ben Browning --- docling/models/tesseract_ocr_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 989ce0e..92a8169 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,7 +1,7 @@ import logging from collections.abc import Iterable from pathlib import Path -from typing import Optional, Type +from typing import Any, Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell @@ -38,6 +38,8 @@ class TesseractOcrModel(BaseOcrModel): self.options: TesseractOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. + self.reader = None + self.script_readers: dict[str, Any] = {} if self.enabled: install_errmsg = ( @@ -84,9 +86,7 @@ class TesseractOcrModel(BaseOcrModel): "oem": tesserocr.OEM.DEFAULT, } - self.reader = None self.osd_reader = None - self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} if self.options.path is not None: tesserocr_kwargs["path"] = self.options.path