fix: remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests (#138)

* feat(OCR tests): Introduce fuzziness in the text validation of OCR tests

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix(TesseractOcrCliModel): Send the stderr to devnull to avoid polluting the console with messages from the tesseract cmd

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-10-11 10:21:19 +02:00 committed by GitHub
parent 5f1bd9e9c8
commit dae2a3b667
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 50 additions and 17 deletions

View File

@ -1,7 +1,7 @@
import io import io
import logging import logging
import tempfile import tempfile
from subprocess import PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, Tuple from typing import Iterable, Tuple
import pandas as pd import pandas as pd
@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd += [ifilename, "stdout", "tsv"] cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd))) _log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate() output, _ = proc.communicate()
# _log.info(output) # _log.info(output)

View File

@ -94,5 +94,5 @@ def test_e2e_conversions():
input_path=pdf_path, input_path=pdf_path,
doc_result=doc_result, doc_result=doc_result,
generate=GENERATE, generate=GENERATE,
skip_cells=True, fuzzy=True,
) )

View File

@ -11,6 +11,42 @@ from docling.datamodel.base_models import ConversionStatus, Page
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
def levenshtein(str1: str, str2: str) -> int:
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    It is computed row by row with two reusable buffers, so only two rows
    of the dynamic-programming table exist at any time.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The edit distance; 0 when the strings are equal, and the length of
        the other string when one of them is empty.
    """
    # Both row buffers are sized by len(str2), so keep str2 as the SHORTER
    # string. The distance is symmetric, so swapping does not change the
    # result; it only bounds memory by the shorter input.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # Previous and current row buffers
    previous_row = list(range(len(str2) + 1))
    current_row = [0] * (len(str2) + 1)

    # Compute the Levenshtein distance row by row
    for i, c1 in enumerate(str1, start=1):
        # Turning the first i characters of str1 into "" costs i edits.
        current_row[0] = i

        for j, c2 in enumerate(str2, start=1):
            insertions = previous_row[j] + 1
            deletions = current_row[j - 1] + 1
            # (c1 != c2) is a bool: adds 1 only when the characters differ.
            substitutions = previous_row[j - 1] + (c1 != c2)
            current_row[j] = min(insertions, deletions, substitutions)

        # Swap rows for the next iteration; the stale row is overwritten
        # in full before any of its cells are read again.
        previous_row, current_row = current_row, previous_row

    # After the final swap the result is the last element of previous_row
    return previous_row[-1]
def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4) -> bool:
    """Assert that a predicted text matches its ground truth.

    With ``fuzzy`` disabled, or when the ground truth is empty, the two
    strings must be exactly equal. Otherwise the Levenshtein distance
    between them, divided by ``len(gt)``, must be strictly below
    ``fuzzy_threshold``.

    Args:
        gt: Ground-truth text.
        pred: Predicted text to validate.
        fuzzy: Whether an approximate match is acceptable.
        fuzzy_threshold: Exclusive upper bound on the normalized edit
            distance in fuzzy mode.

    Returns:
        True when the check passes.

    Raises:
        AssertionError: If the texts do not match under the selected mode.
    """
    # Identical text always verifies. This skips the quadratic edit-distance
    # computation for the common case of an exact match, and keeps equal
    # strings passing even when fuzzy_threshold is 0.
    if gt == pred:
        return True

    if not gt or not fuzzy:
        # An empty ground truth cannot be normalized by its length, so it
        # falls back to exact comparison as well.
        assert gt == pred, f"{gt}!={pred}"
    else:
        dist = levenshtein(gt, pred)
        diff = dist / len(gt)
        assert diff < fuzzy_threshold, f"{gt}!~{pred}"
    return True
def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
assert len(doc_pred_pages) == len( assert len(doc_pred_pages) == len(
@ -32,7 +68,6 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
true_text = cell_true_item.text true_text = cell_true_item.text
pred_text = cell_pred_item.text pred_text = cell_pred_item.text
assert true_text == pred_text, f"{true_text}!={pred_text}" assert true_text == pred_text, f"{true_text}!={pred_text}"
true_bbox = cell_true_item.bbox.as_tuple() true_bbox = cell_true_item.bbox.as_tuple()
@ -69,7 +104,7 @@ def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
return True return True
def verify_tables(doc_pred: DsDocument, doc_true: DsDocument): def verify_tables(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
if doc_true.tables is None: if doc_true.tables is None:
# No tables to check # No tables to check
assert doc_pred.tables is None, "not expecting any table on this document" assert doc_pred.tables is None, "not expecting any table on this document"
@ -102,9 +137,7 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
# print("pred: ", pred_item.data[i][j].text) # print("pred: ", pred_item.data[i][j].text)
# print("") # print("")
assert ( verify_text(true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy)
true_item.data[i][j].text == pred_item.data[i][j].text
), "table-cell does not have the same text"
assert ( assert (
true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
@ -121,12 +154,12 @@ def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
return True return True
def verify_md(doc_pred_md, doc_true_md): def verify_md(doc_pred_md: str, doc_true_md: str, fuzzy: bool):
return doc_pred_md == doc_true_md return verify_text(doc_true_md, doc_pred_md, fuzzy)
def verify_dt(doc_pred_dt, doc_true_dt): def verify_dt(doc_pred_dt: str, doc_true_dt: str, fuzzy: bool):
return doc_pred_dt == doc_true_dt return verify_text(doc_true_dt, doc_pred_dt, fuzzy)
def verify_conversion_result( def verify_conversion_result(
@ -134,7 +167,7 @@ def verify_conversion_result(
doc_result: ConversionResult, doc_result: ConversionResult,
generate: bool = False, generate: bool = False,
ocr_engine: str = None, ocr_engine: str = None,
skip_cells: bool = False, fuzzy: bool = False,
): ):
PageList = TypeAdapter(List[Page]) PageList = TypeAdapter(List[Page])
@ -178,7 +211,7 @@ def verify_conversion_result(
with open(dt_path, "r") as fr: with open(dt_path, "r") as fr:
doc_true_dt = fr.read() doc_true_dt = fr.read()
if not skip_cells: if not fuzzy:
assert verify_cells( assert verify_cells(
doc_pred_pages, doc_true_pages doc_pred_pages, doc_true_pages
), f"Mismatch in PDF cell prediction for {input_path}" ), f"Mismatch in PDF cell prediction for {input_path}"
@ -188,13 +221,13 @@ def verify_conversion_result(
# ), f"Mismatch in JSON prediction for {input_path}" # ), f"Mismatch in JSON prediction for {input_path}"
assert verify_tables( assert verify_tables(
doc_pred, doc_true doc_pred, doc_true, fuzzy
), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}" ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
assert verify_md( assert verify_md(
doc_pred_md, doc_true_md doc_pred_md, doc_true_md, fuzzy
), f"Mismatch in Markdown prediction for {input_path}" ), f"Mismatch in Markdown prediction for {input_path}"
assert verify_dt( assert verify_dt(
doc_pred_dt, doc_true_dt doc_pred_dt, doc_true_dt, fuzzy
), f"Mismatch in DocTags prediction for {input_path}" ), f"Mismatch in DocTags prediction for {input_path}"