fix: Guard against attribute errors in TesseractOcrModel __del__ (#1494)

This moves the initialization of the `reader` and `script_readers`
attributes ahead of the attempt to import tesserocr, so that the
attributes already exist when the finalizer method `__del__` accesses
them later.

This requires changing the type annotation of the `script_readers` dict
values to `Any`, because their actual type, `tesserocr.PyTessBaseAPI`,
cannot be referenced before tesserocr has been imported.

This prevents an exception from being raised during garbage collection
when a TesseractOcrModel instance was not fully initialized, for
example when its initializer raises an `ImportError`.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
Ben Browning 2025-04-30 11:51:33 -04:00 committed by GitHub
parent cc453961a9
commit 4ab7e9ddfb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,7 +1,7 @@
import logging import logging
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional, Type from typing import Any, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -38,6 +38,8 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers: dict[str, Any] = {}
if self.enabled: if self.enabled:
install_errmsg = ( install_errmsg = (
@@ -84,9 +86,7 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT, "oem": tesserocr.OEM.DEFAULT,
} }
self.reader = None
self.osd_reader = None self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None: if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path tesserocr_kwargs["path"] = self.options.path