From dae2a3b66732e1e135b00cce24226c7d9f2eb2e4 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:21:19 +0200 Subject: [PATCH] fix: remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests (#138) * feat(OCR tests): Introduce fuzziness in the text validation of OCR tests Signed-off-by: Nikos Livathinos * fix(TesseractOcrCliModel): Send the stderr to devnull to avoid polluting the console with messages from tesseract cmd Signed-off-by: Nikos Livathinos --------- Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_cli_model.py | 4 +- tests/test_e2e_ocr_conversion.py | 2 +- tests/verify_utils.py | 61 +++++++++++++++++------ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index c3c1999..052d878 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -1,7 +1,7 @@ import io import logging import tempfile -from subprocess import PIPE, Popen +from subprocess import DEVNULL, PIPE, Popen from typing import Iterable, Tuple import pandas as pd @@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel): cmd += [ifilename, "stdout", "tsv"] _log.info("command: {}".format(" ".join(cmd))) - proc = Popen(cmd, stdout=PIPE) + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) output, _ = proc.communicate() # _log.info(output) diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 96bc087..d3a6128 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -94,5 +94,5 @@ def test_e2e_conversions(): input_path=pdf_path, doc_result=doc_result, generate=GENERATE, - skip_cells=True, + fuzzy=True, ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 082b7c7..fc587de 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -11,6 +11,42 @@ from 
docling.datamodel.base_models import ConversionStatus, Page from docling.datamodel.document import ConversionResult +def levenshtein(str1: str, str2: str) -> int: + + # Ensure str1 is the shorter string to optimize memory usage + if len(str1) > len(str2): + str1, str2 = str2, str1 + + # Previous and current row buffers + previous_row = list(range(len(str2) + 1)) + current_row = [0] * (len(str2) + 1) + + # Compute the Levenshtein distance row by row + for i, c1 in enumerate(str1, start=1): + current_row[0] = i + for j, c2 in enumerate(str2, start=1): + insertions = previous_row[j] + 1 + deletions = current_row[j - 1] + 1 + substitutions = previous_row[j - 1] + (c1 != c2) + current_row[j] = min(insertions, deletions, substitutions) + # Swap rows for the next iteration + previous_row, current_row = current_row, previous_row + + # The result is in the last element of the previous row + return previous_row[-1] + + +def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4): + + if len(gt) == 0 or not fuzzy: + assert gt == pred, f"{gt}!={pred}" + else: + dist = levenshtein(gt, pred) + diff = dist / len(gt) + assert diff < fuzzy_threshold, f"{gt}!~{pred}" + return True + + def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): assert len(doc_pred_pages) == len( @@ -32,7 +68,6 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): true_text = cell_true_item.text pred_text = cell_pred_item.text - assert true_text == pred_text, f"{true_text}!={pred_text}" true_bbox = cell_true_item.bbox.as_tuple() @@ -69,7 +104,7 @@ def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument): return True -def verify_tables(doc_pred: DsDocument, doc_true: DsDocument): +def verify_tables(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool): if doc_true.tables is None: # No tables to check assert doc_pred.tables is None, "not expecting any table on this document" @@ -102,9 +137,7 @@ def verify_tables(doc_pred: DsDocument, 
doc_true: DsDocument): # print("pred: ", pred_item.data[i][j].text) # print("") - assert ( - true_item.data[i][j].text == pred_item.data[i][j].text - ), "table-cell does not have the same text" + verify_text(true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy) assert ( true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type @@ -121,12 +154,12 @@ def verify_output(doc_pred: DsDocument, doc_true: DsDocument): return True -def verify_md(doc_pred_md, doc_true_md): - return doc_pred_md == doc_true_md +def verify_md(doc_pred_md: str, doc_true_md: str, fuzzy: bool): + return verify_text(doc_true_md, doc_pred_md, fuzzy) -def verify_dt(doc_pred_dt, doc_true_dt): - return doc_pred_dt == doc_true_dt +def verify_dt(doc_pred_dt: str, doc_true_dt: str, fuzzy: bool): + return verify_text(doc_true_dt, doc_pred_dt, fuzzy) def verify_conversion_result( @@ -134,7 +167,7 @@ def verify_conversion_result( doc_result: ConversionResult, generate: bool = False, ocr_engine: str = None, - skip_cells: bool = False, + fuzzy: bool = False, ): PageList = TypeAdapter(List[Page]) @@ -178,7 +211,7 @@ def verify_conversion_result( with open(dt_path, "r") as fr: doc_true_dt = fr.read() - if not skip_cells: + if not fuzzy: assert verify_cells( doc_pred_pages, doc_true_pages ), f"Mismatch in PDF cell prediction for {input_path}" @@ -188,13 +221,13 @@ def verify_conversion_result( # ), f"Mismatch in JSON prediction for {input_path}" assert verify_tables( - doc_pred, doc_true + doc_pred, doc_true, fuzzy ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}" assert verify_md( - doc_pred_md, doc_true_md + doc_pred_md, doc_true_md, fuzzy ), f"Mismatch in Markdown prediction for {input_path}" assert verify_dt( - doc_pred_dt, doc_true_dt + doc_pred_dt, doc_true_dt, fuzzy ), f"Mismatch in DocTags prediction for {input_path}"