
import json
import os
import warnings
from pathlib import Path
from typing import List, Optional

from docling_core.types.doc import (
    DocItem,
    DoclingDocument,
    PictureItem,
    TableItem,
    TextItem,
)
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import Image as PILImage
from pydantic import TypeAdapter
from pydantic.json import pydantic_encoder

from docling.datamodel.base_models import ConversionStatus, Page
from docling.datamodel.document import ConversionResult

def levenshtein(str1: str, str2: str) -> int:
    """Compute the Levenshtein (edit) distance between two strings."""
    # Ensure str1 is the shorter string to optimize memory usage
    if len(str1) > len(str2):
        str1, str2 = str2, str1

    # Previous and current row buffers
    previous_row = list(range(len(str2) + 1))
    current_row = [0] * (len(str2) + 1)

    # Compute the Levenshtein distance row by row
    for i, c1 in enumerate(str1, start=1):
        current_row[0] = i
        for j, c2 in enumerate(str2, start=1):
            insertions = previous_row[j] + 1
            deletions = current_row[j - 1] + 1
            substitutions = previous_row[j - 1] + (c1 != c2)
            current_row[j] = min(insertions, deletions, substitutions)
        # Swap rows for the next iteration
        previous_row, current_row = current_row, previous_row

    # The result is in the last element of the previous row
    return previous_row[-1]

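
# A tiny sanity illustration for the helper above (kept as comments so that
# importing this module stays side-effect free): "kitten" -> "sitting" takes
# exactly three single-character edits, and equal strings have distance 0.
#
#   assert levenshtein("kitten", "sitting") == 3
#   assert levenshtein("same", "same") == 0
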
def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
    """Check that predicted text matches ground truth, exactly or fuzzily.

    In fuzzy mode, the Levenshtein distance normalized by the ground-truth
    length must stay below `fuzzy_threshold`.
    """
    if len(gt) == 0 or not fuzzy:
        assert gt == pred, f"{gt}!={pred}"
    else:
        dist = levenshtein(gt, pred)
        diff = dist / len(gt)
        assert diff < fuzzy_threshold, f"{gt}!~{pred}"
    return True

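
# Worked example of the fuzzy check: for gt="docling" and pred="docIing" the
# distance is 1, so the normalized diff is 1/7 (about 0.14), which passes the
# default threshold of 0.4; exact mode would fail on the same pair.
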
def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
    """Compare the PDF cells of predicted pages against ground-truth pages."""
    assert len(doc_pred_pages) == len(
        doc_true_pages
    ), "pred- and true-doc do not have the same number of pages"

    for pid, page_true_item in enumerate(doc_true_pages):
        num_true_cells = len(page_true_item.cells)
        num_pred_cells = len(doc_pred_pages[pid].cells)

        assert (
            num_true_cells == num_pred_cells
        ), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"

        for cid, cell_true_item in enumerate(page_true_item.cells):
            cell_pred_item = doc_pred_pages[pid].cells[cid]

            true_text = cell_true_item.text
            pred_text = cell_pred_item.text
            assert true_text == pred_text, f"{true_text}!={pred_text}"

            true_bbox = cell_true_item.bbox.as_tuple()
            pred_bbox = cell_pred_item.bbox.as_tuple()
            assert (
                true_bbox == pred_bbox
            ), f"bbox is not the same: {true_bbox} != {pred_bbox}"

    return True

# def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
#     assert doc_true.main_text is not None, "doc_true.main_text cannot be None"
#     assert doc_pred.main_text is not None, "doc_pred.main_text cannot be None"
#
#     assert len(doc_true.main_text) == len(
#         doc_pred.main_text
#     ), f"document has different length of main-text than expected. {len(doc_true.main_text)}!={len(doc_pred.main_text)}"
#
#     for idx, true_item in enumerate(doc_true.main_text):
#         pred_item = doc_pred.main_text[idx]
#         # Validate type
#         assert (
#             true_item.obj_type == pred_item.obj_type
#         ), f"Item[{idx}] type does not match. expected[{true_item.obj_type}] != predicted [{pred_item.obj_type}]"
#
#         # Validate text cells
#         if isinstance(true_item, BaseText):
#             assert isinstance(
#                 pred_item, BaseText
#             ), f"{pred_item} is not a BaseText element, but {true_item} is."
#             assert true_item.text == pred_item.text
#
#     return True

def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
    """Compare the tables of a predicted legacy document against ground truth."""
    if doc_true.tables is None:
        # No tables to check
        assert doc_pred.tables is None, "not expecting any table on this document"
        return True

    assert doc_pred.tables is not None, "no tables predicted, but expected in doc_true"

    # print("Expected number of tables: {}, result: {}".format(len(doc_true.tables), len(doc_pred.tables)))

    assert len(doc_true.tables) == len(
        doc_pred.tables
    ), "document has different count of tables than expected."

    for idx, true_item in enumerate(doc_true.tables):
        pred_item = doc_pred.tables[idx]

        assert (
            true_item.num_rows == pred_item.num_rows
        ), "table does not have the same #-rows"
        assert (
            true_item.num_cols == pred_item.num_cols
        ), "table does not have the same #-cols"

        assert true_item.data is not None, "documents are expected to have table data"
        assert pred_item.data is not None, "documents are expected to have table data"

        print("True: \n", true_item.export_to_dataframe().to_markdown())
        print("Pred: \n", pred_item.export_to_dataframe().to_markdown())

        for i, row in enumerate(true_item.data):
            for j, col in enumerate(row):
                # print("true: ", true_item.data[i][j].text)
                # print("pred: ", pred_item.data[i][j].text)
                # print("")

                verify_text(
                    true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy=fuzzy
                )

                assert (
                    true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
                ), "table-cell does not have the same type"

    return True

def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
    """Compare a predicted TableItem against ground truth, cell by cell."""
    assert true_item.data is not None, "documents are expected to have table data"
    assert pred_item.data is not None, "documents are expected to have table data"

    assert (
        true_item.data.num_rows == pred_item.data.num_rows
    ), "table does not have the same #-rows"
    assert (
        true_item.data.num_cols == pred_item.data.num_cols
    ), "table does not have the same #-cols"

    # print("True: \n", true_item.export_to_dataframe().to_markdown())
    # print("Pred: \n", pred_item.export_to_dataframe().to_markdown())

    for i, row in enumerate(true_item.data.grid):
        for j, col in enumerate(row):
            # print("true: ", true_item.data.grid[i][j].text)
            # print("pred: ", pred_item.data.grid[i][j].text)
            # print("")

            verify_text(
                true_item.data.grid[i][j].text,
                pred_item.data.grid[i][j].text,
                fuzzy=fuzzy,
            )

            assert (
                true_item.data.grid[i][j].column_header
                == pred_item.data.grid[i][j].column_header
            ), "table-cell column_header flags do not match"

            assert (
                true_item.data.grid[i][j].row_header
                == pred_item.data.grid[i][j].row_header
            ), "table-cell row_header flags do not match"

            assert (
                true_item.data.grid[i][j].row_section
                == pred_item.data.grid[i][j].row_section
            ), "table-cell row_section flags do not match"

    return True

def verify_picture_image_v2(
    true_image: PILImage.Image, pred_image: Optional[PILImage.Image]
):
    """Compare a predicted picture image against ground truth (size and mode)."""
    assert pred_image is not None, "predicted image is None"
    assert true_image.size == pred_image.size
    assert true_image.mode == pred_image.mode
    # assert true_image.tobytes() == pred_image.tobytes()
    return True

# def verify_output(doc_pred: DsDocument, doc_true: DsDocument):
#     # assert verify_maintext(doc_pred, doc_true), "verify_maintext(doc_pred, doc_true)"
#     assert verify_tables_v1(doc_pred, doc_true), "verify_tables(doc_pred, doc_true)"
#     return True

def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
    """Compare the items of a predicted DoclingDocument against ground truth."""
    assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."

    assert len(doc_true.tables) == len(
        doc_pred.tables
    ), "document has different count of tables than expected."

    for (true_item, _true_level), (pred_item, _pred_level) in zip(
        doc_true.iterate_items(), doc_pred.iterate_items()
    ):
        if not isinstance(true_item, DocItem):
            continue
        assert isinstance(pred_item, DocItem), "Test item is not a DocItem"

        # Validate type
        assert true_item.label == pred_item.label, "Object label does not match."

        # Validate provenance
        assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
        if len(true_item.prov) > 0:
            true_prov = true_item.prov[0]
            pred_prov = pred_item.prov[0]

            assert true_prov.page_no == pred_prov.page_no, "Page provenance mismatch"

            # TODO: add bbox check with tolerance

        # Validate text content
        if isinstance(true_item, TextItem):
            assert isinstance(pred_item, TextItem), (
                "Test item is not a TextItem as the expected one "
                f"{true_item=} "
                f"{pred_item=} "
            )

            assert verify_text(true_item.text, pred_item.text, fuzzy=fuzzy)

        # Validate table content
        if isinstance(true_item, TableItem):
            assert isinstance(
                pred_item, TableItem
            ), "Test item is not a TableItem as the expected one"
            assert verify_table_v2(
                true_item, pred_item, fuzzy=fuzzy
            ), "Tables not matching"

        # Validate picture content
        if isinstance(true_item, PictureItem):
            assert isinstance(
                pred_item, PictureItem
            ), "Test item is not a PictureItem as the expected one"

            true_image = true_item.get_image(doc=doc_true)
            pred_image = pred_item.get_image(doc=doc_pred)
            if true_image is not None:
                assert verify_picture_image_v2(
                    true_image, pred_image
                ), "Picture image mismatch"

            # TODO: check picture annotations

    return True

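
# Illustrative property of the comparison above (hypothetical snippet, not
# part of the test suite): a document always matches a reloaded copy of
# itself, since every item, label, and provenance entry is identical.
#
#   reloaded = DoclingDocument.model_validate(doc.export_to_dict())
#   assert verify_docitems(reloaded, doc, fuzzy=False)
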
def verify_md(doc_pred_md: str, doc_true_md: str, fuzzy: bool):
    return verify_text(doc_true_md, doc_pred_md, fuzzy)

def verify_dt(doc_pred_dt: str, doc_true_dt: str, fuzzy: bool):
    return verify_text(doc_true_dt, doc_pred_dt, fuzzy)

def verify_conversion_result_v1(
    input_path: Path,
    doc_result: ConversionResult,
    generate: bool = False,
    ocr_engine: Optional[str] = None,
    fuzzy: bool = False,
):
    """Verify a conversion result against the v1 (legacy) ground truth."""
    PageList = TypeAdapter(List[Page])

    assert (
        doc_result.status == ConversionStatus.SUCCESS
    ), f"Doc {input_path} did not convert successfully."

    doc_pred_pages: List[Page] = doc_result.pages
    doc_pred: DsDocument = doc_result.legacy_document
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        doc_pred_md = doc_result.legacy_document.export_to_markdown()
        doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()

    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

    gt_subpath = input_path.parent / "groundtruth" / "docling_v1" / input_path.name
    if str(input_path.parent).endswith("pdf"):
        gt_subpath = (
            input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
        )

    pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
    json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
    md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))

        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)

        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
        with open(pages_path, "r") as fr:
            doc_true_pages = PageList.validate_json(fr.read())

        with open(json_path, "r") as fr:
            doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

        with open(md_path, "r") as fr:
            doc_true_md = fr.read()

        with open(dt_path, "r") as fr:
            doc_true_dt = fr.read()

        if not fuzzy:
            assert verify_cells(
                doc_pred_pages, doc_true_pages
            ), f"Mismatch in PDF cell prediction for {input_path}"

        # assert verify_output(
        #    doc_pred, doc_true
        # ), f"Mismatch in JSON prediction for {input_path}"

        assert verify_tables_v1(
            doc_pred, doc_true, fuzzy=fuzzy
        ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"

        assert verify_md(
            doc_pred_md, doc_true_md, fuzzy=fuzzy
        ), f"Mismatch in Markdown prediction for {input_path}"

        assert verify_dt(
            doc_pred_dt, doc_true_dt, fuzzy=fuzzy
        ), f"Mismatch in DocTags prediction for {input_path}"

def verify_conversion_result_v2(
    input_path: Path,
    doc_result: ConversionResult,
    generate: bool = False,
    ocr_engine: Optional[str] = None,
    fuzzy: bool = False,
):
    """Verify a conversion result against the v2 (DoclingDocument) ground truth."""
    PageList = TypeAdapter(List[Page])

    assert (
        doc_result.status == ConversionStatus.SUCCESS
    ), f"Doc {input_path} did not convert successfully."

    doc_pred_pages: List[Page] = doc_result.pages
    doc_pred: DoclingDocument = doc_result.document
    doc_pred_md = doc_result.document.export_to_markdown()
    doc_pred_dt = doc_result.document.export_to_document_tokens()

    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"

    gt_subpath = input_path.parent / "groundtruth" / "docling_v2" / input_path.name
    if str(input_path.parent).endswith("pdf"):
        gt_subpath = (
            input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
        )

    pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
    json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
    md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))

        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)

        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
        with open(pages_path, "r") as fr:
            doc_true_pages = PageList.validate_json(fr.read())

        with open(json_path, "r") as fr:
            doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())

        with open(md_path, "r") as fr:
            doc_true_md = fr.read()

        with open(dt_path, "r") as fr:
            doc_true_dt = fr.read()

        if not fuzzy:
            assert verify_cells(
                doc_pred_pages, doc_true_pages
            ), f"Mismatch in PDF cell prediction for {input_path}"

        # assert verify_output(
        #    doc_pred, doc_true
        # ), f"Mismatch in JSON prediction for {input_path}"

        assert verify_docitems(
            doc_pred, doc_true, fuzzy=fuzzy
        ), f"verify_docling_document(doc_pred, doc_true) mismatch for {input_path}"

        assert verify_md(
            doc_pred_md, doc_true_md, fuzzy=fuzzy
        ), f"Mismatch in Markdown prediction for {input_path}"

        assert verify_dt(
            doc_pred_dt, doc_true_dt, fuzzy=fuzzy
        ), f"Mismatch in DocTags prediction for {input_path}"

def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    """Verify a predicted document against a ground-truth JSON file.

    If the ground-truth file is missing or `generate` is set, (re-)generate it
    from the prediction instead of comparing.
    """
    if not os.path.exists(gtfile) or generate:
        with open(gtfile, "w") as fw:
            json.dump(pred_doc.export_to_dict(), fw, indent=2)

        return True
    else:
        with open(gtfile) as fr:
            true_doc = DoclingDocument.model_validate_json(fr.read())

        return verify_docitems(pred_doc, true_doc, fuzzy=False)
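
# A minimal usage sketch for backend tests sharing this helper (hypothetical
# input and ground-truth file names):
#
#   from docling.document_converter import DocumentConverter
#
#   result = DocumentConverter().convert("tests/data/html/example_01.html")
#   assert verify_document(result.document, "groundtruth/example_01.html.json")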