diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index cdc5671..ac8dd51 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -114,7 +114,9 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
+        df_filtered = df[
+            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        ]
 
         return df_filtered
 