test: ensure utf-8 in test data utils (#1691)
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
984cb137f6
commit
1c8a1283c4
@ -323,33 +323,33 @@ def verify_conversion_result_v1(
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(
|
||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||
)
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
with open(pages_path) as fr:
|
||||
with open(pages_path, encoding="utf-8") as fr:
|
||||
doc_true_pages = PageList.validate_json(fr.read())
|
||||
|
||||
with open(json_path) as fr:
|
||||
with open(json_path, encoding="utf-8") as fr:
|
||||
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
||||
|
||||
with open(md_path) as fr:
|
||||
with open(md_path, encoding="utf-8") as fr:
|
||||
doc_true_md = fr.read()
|
||||
|
||||
with open(dt_path) as fr:
|
||||
with open(dt_path, encoding="utf-8") as fr:
|
||||
doc_true_dt = fr.read()
|
||||
|
||||
if not fuzzy:
|
||||
@ -408,33 +408,33 @@ def verify_conversion_result_v2(
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(
|
||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||
)
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(doc_pred_md)
|
||||
|
||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(dt_path, "w") as fw:
|
||||
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||
fw.write(doc_pred_dt)
|
||||
else: # default branch in test
|
||||
with open(pages_path) as fr:
|
||||
with open(pages_path, encoding="utf-8") as fr:
|
||||
doc_true_pages = PageList.validate_json(fr.read())
|
||||
|
||||
with open(json_path) as fr:
|
||||
with open(json_path, encoding="utf-8") as fr:
|
||||
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
with open(md_path) as fr:
|
||||
with open(md_path, encoding="utf-8") as fr:
|
||||
doc_true_md = fr.read()
|
||||
|
||||
with open(dt_path) as fr:
|
||||
with open(dt_path, encoding="utf-8") as fr:
|
||||
doc_true_dt = fr.read()
|
||||
|
||||
if not fuzzy:
|
||||
@ -461,12 +461,12 @@ def verify_conversion_result_v2(
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    """Check ``pred_doc`` against the ground-truth JSON stored at ``gtfile``.

    If ``gtfile`` does not exist, or ``generate`` is true, the prediction is
    written to ``gtfile`` (becoming the new ground truth) and ``True`` is
    returned without any comparison. Otherwise the stored document is loaded
    and compared with ``verify_docitems`` using ``fuzzy=False``.

    Args:
        pred_doc: The predicted document to verify or to persist.
        gtfile: Path of the ground-truth JSON file.
        generate: When true, overwrite ``gtfile`` with ``pred_doc``.

    Returns:
        ``True`` when the ground truth was (re)generated, otherwise the
        result of ``verify_docitems(pred_doc, true_doc, fuzzy=False)``.
    """
    if not os.path.exists(gtfile) or generate:
        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # is opened as UTF-8 explicitly instead of relying on the platform's
        # locale-dependent default encoding.
        with open(gtfile, mode="w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)

        return True
    else:
        # Read back with the same encoding that the write branch uses.
        with open(gtfile, encoding="utf-8") as fr:
            true_doc = DoclingDocument.model_validate_json(fr.read())

        return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
    """Check exported text against the ground-truth file at ``gtfile``.

    If ``gtfile`` does not exist, or ``generate`` is true, ``pred_text`` is
    written to ``gtfile`` (becoming the new ground truth) and ``True`` is
    returned without any comparison. Otherwise the file content is read and
    compared with ``pred_text`` for exact equality.

    Args:
        pred_text: The exported text produced by the code under test.
        gtfile: Path of the ground-truth text file.
        generate: When true, overwrite ``gtfile`` with ``pred_text``.

    Returns:
        ``True`` when the ground truth was (re)generated or when the stored
        text equals ``pred_text``; ``False`` otherwise.
    """
    file = Path(gtfile)

    if not file.exists() or generate:
        # Explicit UTF-8 keeps the stored ground truth independent of the
        # platform's locale-dependent default encoding.
        with file.open(mode="w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    # Read with the same encoding that the write branch uses.
    with file.open(encoding="utf-8") as fr:
        true_text = fr.read()

    return pred_text == true_text
|
||||
|
Loading…
Reference in New Issue
Block a user