From 6796f0a13263281cd48712b3c71579bfd81bb0d1 Mon Sep 17 00:00:00 2001
From: fanszoro
Date: Thu, 20 Feb 2025 22:41:41 +0800
Subject: [PATCH] fix: Runtime error when Pandas Series is not always of string type (#1024)

Signed-off-by: fan
---
 docling/models/tesseract_ocr_cli_model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index cdc5671..ac8dd51 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -114,7 +114,9 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
+        df_filtered = df[
+            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        ]
 
         return df_filtered
 