fix: remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests (#138)
* feat(OCR tests): Introduce fuzziness in the text validation of OCR tests Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix(TesseractOcrCliModel): Send the stderr to devnull to avoid polluting the console with messages from tesseract cmd Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
5f1bd9e9c8
commit
dae2a3b667
@ -1,7 +1,7 @@
|
|||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
from subprocess import PIPE, Popen
|
from subprocess import DEVNULL, PIPE, Popen
|
||||||
from typing import Iterable, Tuple
|
from typing import Iterable, Tuple
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -81,7 +81,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
cmd += [ifilename, "stdout", "tsv"]
|
cmd += [ifilename, "stdout", "tsv"]
|
||||||
_log.info("command: {}".format(" ".join(cmd)))
|
_log.info("command: {}".format(" ".join(cmd)))
|
||||||
|
|
||||||
proc = Popen(cmd, stdout=PIPE)
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||||
output, _ = proc.communicate()
|
output, _ = proc.communicate()
|
||||||
|
|
||||||
# _log.info(output)
|
# _log.info(output)
|
||||||
|
@ -94,5 +94,5 @@ def test_e2e_conversions():
|
|||||||
input_path=pdf_path,
|
input_path=pdf_path,
|
||||||
doc_result=doc_result,
|
doc_result=doc_result,
|
||||||
generate=GENERATE,
|
generate=GENERATE,
|
||||||
skip_cells=True,
|
fuzzy=True,
|
||||||
)
|
)
|
||||||
|
@ -11,6 +11,42 @@ from docling.datamodel.base_models import ConversionStatus, Page
|
|||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
|
|
||||||
|
|
||||||
|
def levenshtein(str1: str, str2: str) -> int:
    """Return the Levenshtein edit distance between ``str1`` and ``str2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The edit distance as a non-negative integer. If one string is
        empty, this is the length of the other.
    """
    # The row buffers below are sized by str2, so make str2 the shorter
    # string to keep memory proportional to the shorter input. The distance
    # is symmetric, so swapping the arguments does not change the result.
    if len(str1) < len(str2):
        str1, str2 = str2, str1

    # Previous and current row buffers
    previous_row = list(range(len(str2) + 1))
    current_row = [0] * (len(str2) + 1)

    # Compute the Levenshtein distance row by row
    for i, c1 in enumerate(str1, start=1):
        current_row[0] = i
        for j, c2 in enumerate(str2, start=1):
            insertions = previous_row[j] + 1
            deletions = current_row[j - 1] + 1
            # (c1 != c2) is a bool used as 0/1: matching chars cost nothing
            substitutions = previous_row[j - 1] + (c1 != c2)
            current_row[j] = min(insertions, deletions, substitutions)
        # Swap rows for the next iteration
        previous_row, current_row = current_row, previous_row

    # After the final swap the last computed row is in previous_row
    return previous_row[-1]
|
||||||
|
|
||||||
|
|
||||||
|
def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
    """Assert that ``pred`` matches the ground-truth text ``gt``.

    When ``fuzzy`` is false, or ``gt`` is empty, the two strings must be
    exactly equal. Otherwise the Levenshtein distance between them, divided
    by ``len(gt)``, must be strictly below ``fuzzy_threshold``.

    Args:
        gt: Ground-truth text.
        pred: Predicted text to validate.
        fuzzy: Whether approximate matching is allowed.
        fuzzy_threshold: Upper bound (exclusive) on the length-normalized
            edit distance accepted in fuzzy mode.

    Returns:
        ``True`` when the check passes.

    Raises:
        AssertionError: If the texts do not match under the selected mode.
    """
    gt_length = len(gt)

    # Exact comparison; an empty ground truth cannot be normalized by length.
    if gt_length == 0 or not fuzzy:
        assert gt == pred, f"{gt}!={pred}"
        return True

    normalized_distance = levenshtein(gt, pred) / gt_length
    assert normalized_distance < fuzzy_threshold, f"{gt}!~{pred}"
    return True
|
||||||
|
|
||||||
|
|
||||||
def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
|
def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
|
||||||
|
|
||||||
assert len(doc_pred_pages) == len(
|
assert len(doc_pred_pages) == len(
|
||||||
@ -32,7 +68,6 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
|
|||||||
|
|
||||||
true_text = cell_true_item.text
|
true_text = cell_true_item.text
|
||||||
pred_text = cell_pred_item.text
|
pred_text = cell_pred_item.text
|
||||||
|
|
||||||
assert true_text == pred_text, f"{true_text}!={pred_text}"
|
assert true_text == pred_text, f"{true_text}!={pred_text}"
|
||||||
|
|
||||||
true_bbox = cell_true_item.bbox.as_tuple()
|
true_bbox = cell_true_item.bbox.as_tuple()
|
||||||
@ -69,7 +104,7 @@ def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
|
def verify_tables(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
|
||||||
if doc_true.tables is None:
|
if doc_true.tables is None:
|
||||||
# No tables to check
|
# No tables to check
|
||||||
assert doc_pred.tables is None, "not expecting any table on this document"
|
assert doc_pred.tables is None, "not expecting any table on this document"
|
||||||
@ -102,9 +137,7 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
|
|||||||
# print("pred: ", pred_item.data[i][j].text)
|
# print("pred: ", pred_item.data[i][j].text)
|
||||||
# print("")
|
# print("")
|
||||||
|
|
||||||
assert (
|
verify_text(true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy)
|
||||||
true_item.data[i][j].text == pred_item.data[i][j].text
|
|
||||||
), "table-cell does not have the same text"
|
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
|
true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
|
||||||
@ -121,12 +154,12 @@ def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def verify_md(doc_pred_md, doc_true_md):
|
def verify_md(doc_pred_md: str, doc_true_md: str, fuzzy: bool):
|
||||||
return doc_pred_md == doc_true_md
|
return verify_text(doc_true_md, doc_pred_md, fuzzy)
|
||||||
|
|
||||||
|
|
||||||
def verify_dt(doc_pred_dt, doc_true_dt):
|
def verify_dt(doc_pred_dt: str, doc_true_dt: str, fuzzy: bool):
|
||||||
return doc_pred_dt == doc_true_dt
|
return verify_text(doc_true_dt, doc_pred_dt, fuzzy)
|
||||||
|
|
||||||
|
|
||||||
def verify_conversion_result(
|
def verify_conversion_result(
|
||||||
@ -134,7 +167,7 @@ def verify_conversion_result(
|
|||||||
doc_result: ConversionResult,
|
doc_result: ConversionResult,
|
||||||
generate: bool = False,
|
generate: bool = False,
|
||||||
ocr_engine: str = None,
|
ocr_engine: str = None,
|
||||||
skip_cells: bool = False,
|
fuzzy: bool = False,
|
||||||
):
|
):
|
||||||
PageList = TypeAdapter(List[Page])
|
PageList = TypeAdapter(List[Page])
|
||||||
|
|
||||||
@ -178,7 +211,7 @@ def verify_conversion_result(
|
|||||||
with open(dt_path, "r") as fr:
|
with open(dt_path, "r") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not skip_cells:
|
if not fuzzy:
|
||||||
assert verify_cells(
|
assert verify_cells(
|
||||||
doc_pred_pages, doc_true_pages
|
doc_pred_pages, doc_true_pages
|
||||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||||
@ -188,13 +221,13 @@ def verify_conversion_result(
|
|||||||
# ), f"Mismatch in JSON prediction for {input_path}"
|
# ), f"Mismatch in JSON prediction for {input_path}"
|
||||||
|
|
||||||
assert verify_tables(
|
assert verify_tables(
|
||||||
doc_pred, doc_true
|
doc_pred, doc_true, fuzzy
|
||||||
), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
|
), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
|
||||||
|
|
||||||
assert verify_md(
|
assert verify_md(
|
||||||
doc_pred_md, doc_true_md
|
doc_pred_md, doc_true_md, fuzzy
|
||||||
), f"Mismatch in Markdown prediction for {input_path}"
|
), f"Mismatch in Markdown prediction for {input_path}"
|
||||||
|
|
||||||
assert verify_dt(
|
assert verify_dt(
|
||||||
doc_pred_dt, doc_true_dt
|
doc_pred_dt, doc_true_dt, fuzzy
|
||||||
), f"Mismatch in DocTags prediction for {input_path}"
|
), f"Mismatch in DocTags prediction for {input_path}"
|
||||||
|
Loading…
Reference in New Issue
Block a user