From b3d111a3cdb90b653ddaaa356f9299e9cd39b340 Mon Sep 17 00:00:00 2001 From: Guilhem VERMOREL <83694424+guilhemvermorel@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:53:49 +0200 Subject: [PATCH] fix: Tesseract OCR CLI can't process images composed with numbers only (#1201) fix wrong type text extracted by tesseract_ocr_cli_model Signed-off-by: gvl4 Co-authored-by: gvl4 --- docling/models/tesseract_ocr_cli_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 56968a2..1e7fe03 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -247,7 +247,7 @@ class TesseractOcrCliModel(BaseOcrModel): cell = TextCell( index=ix, - text=text, + text=str(text), orig=text, from_ocr=True, confidence=conf / 100.0,