From dae2a3b66732e1e135b00cce24226c7d9f2eb2e4 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com>
Date: Fri, 11 Oct 2024 10:21:19 +0200
Subject: [PATCH] fix: remove stderr from tesseract cli and introduce fuzziness
 in the text validation of OCR tests (#138)

* feat(OCR tests): Introduce fuzziness in the text validation of OCR tests

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix(TesseractOcrCliModel): Send the stderr to devnull to avoid polluting the console with messages from tesseract cmd

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---
 docling/models/tesseract_ocr_cli_model.py |  4 +-
 tests/test_e2e_ocr_conversion.py          |  2 +-
 tests/verify_utils.py                     | 61 +++++++++++++++++------
 3 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index c3c1999..052d878 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -1,7 +1,7 @@
 import io
 import logging
 import tempfile
-from subprocess import PIPE, Popen
+from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Tuple
 
 import pandas as pd
@@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         cmd += [ifilename, "stdout", "tsv"]
         _log.info("command: {}".format(" ".join(cmd)))
 
-        proc = Popen(cmd, stdout=PIPE)
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
 
         # _log.info(output)
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 96bc087..d3a6128 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -94,5 +94,5 @@ def test_e2e_conversions():
                 input_path=pdf_path,
                 doc_result=doc_result,
                 generate=GENERATE,
-                skip_cells=True,
+                fuzzy=True,
             )
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 082b7c7..fc587de 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -11,6 +11,42 @@ from docling.datamodel.base_models import ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
 
 
+def levenshtein(str1: str, str2: str) -> int:
+
+    # Ensure str1 is the shorter string to optimize memory usage
+    if len(str1) > len(str2):
+        str1, str2 = str2, str1
+
+    # Previous and current row buffers
+    previous_row = list(range(len(str2) + 1))
+    current_row = [0] * (len(str2) + 1)
+
+    # Compute the Levenshtein distance row by row
+    for i, c1 in enumerate(str1, start=1):
+        current_row[0] = i
+        for j, c2 in enumerate(str2, start=1):
+            insertions = previous_row[j] + 1
+            deletions = current_row[j - 1] + 1
+            substitutions = previous_row[j - 1] + (c1 != c2)
+            current_row[j] = min(insertions, deletions, substitutions)
+        # Swap rows for the next iteration
+        previous_row, current_row = current_row, previous_row
+
+    # The result is in the last element of the previous row
+    return previous_row[-1]
+
+
+def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
+
+    if len(gt) == 0 or not fuzzy:
+        assert gt == pred, f"{gt}!={pred}"
+    else:
+        dist = levenshtein(gt, pred)
+        diff = dist / len(gt)
+        assert diff < fuzzy_threshold, f"{gt}!~{pred}"
+    return True
+
+
 def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
 
     assert len(doc_pred_pages) == len(
@@ -32,7 +68,6 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
 
             true_text = cell_true_item.text
             pred_text = cell_pred_item.text
-
             assert true_text == pred_text, f"{true_text}!={pred_text}"
 
             true_bbox = cell_true_item.bbox.as_tuple()
@@ -69,7 +104,7 @@ def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
     return True
 
 
-def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
+def verify_tables(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
     if doc_true.tables is None:
         # No tables to check
         assert doc_pred.tables is None, "not expecting any table on this document"
@@ -102,9 +137,7 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
                 # print("pred: ", pred_item.data[i][j].text)
                 # print("")
 
-                assert (
-                    true_item.data[i][j].text == pred_item.data[i][j].text
-                ), "table-cell does not have the same text"
+                verify_text(true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy)
 
                 assert (
                     true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
@@ -121,12 +154,12 @@ def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
     return True
 
 
-def verify_md(doc_pred_md, doc_true_md):
-    return doc_pred_md == doc_true_md
+def verify_md(doc_pred_md: str, doc_true_md: str, fuzzy: bool):
+    return verify_text(doc_true_md, doc_pred_md, fuzzy)
 
 
-def verify_dt(doc_pred_dt, doc_true_dt):
-    return doc_pred_dt == doc_true_dt
+def verify_dt(doc_pred_dt: str, doc_true_dt: str, fuzzy: bool):
+    return verify_text(doc_true_dt, doc_pred_dt, fuzzy)
 
 
 def verify_conversion_result(
@@ -134,7 +167,7 @@ def verify_conversion_result(
     doc_result: ConversionResult,
     generate: bool = False,
     ocr_engine: str = None,
-    skip_cells: bool = False,
+    fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
 
@@ -178,7 +211,7 @@ def verify_conversion_result(
         with open(dt_path, "r") as fr:
             doc_true_dt = fr.read()
 
-        if not skip_cells:
+        if not fuzzy:
             assert verify_cells(
                 doc_pred_pages, doc_true_pages
             ), f"Mismatch in PDF cell prediction for {input_path}"
@@ -188,13 +221,13 @@ def verify_conversion_result(
         # ), f"Mismatch in JSON prediction for {input_path}"
 
         assert verify_tables(
-            doc_pred, doc_true
+            doc_pred, doc_true, fuzzy
         ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
 
         assert verify_md(
-            doc_pred_md, doc_true_md
+            doc_pred_md, doc_true_md, fuzzy
         ), f"Mismatch in Markdown prediction for {input_path}"
 
         assert verify_dt(
-            doc_pred_dt, doc_true_dt
+            doc_pred_dt, doc_true_dt, fuzzy
         ), f"Mismatch in DocTags prediction for {input_path}"