fix: Guard against attribute errors in TesseractOcrModel __del__ (#1494)

This moves the initialization of the `reader` and `script_readers`
attributes ahead of the attempt to import tesserocr, so that both
attributes are guaranteed to exist when the finalizer `__del__` accesses
them later during garbage collection.

This requires changing the type annotation of the `script_readers` dict
values to `Any`, because their actual type (`tesserocr.PyTessBaseAPI`)
cannot be referenced before tesserocr has been imported.

This prevents an exception from being thrown during garbage collection
in cases where the TesseractOcrModel instance was not fully initialized,
such as when its initializer raises an `ImportError`.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
This commit is contained in:
Ben Browning 2025-04-30 11:51:33 -04:00 committed by GitHub
parent cc453961a9
commit 4ab7e9ddfb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,7 +1,7 @@
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Optional, Type
from typing import Any, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -38,6 +38,8 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers: dict[str, Any] = {}
if self.enabled:
install_errmsg = (
@ -84,9 +86,7 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT,
}
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path