From 1d17e7397a2a2410d22045069912ed9eb65a0191 Mon Sep 17 00:00:00 2001 From: Suehtam <84603752+MatheusAbdias@users.noreply.github.com> Date: Mon, 24 Feb 2025 07:10:40 +0000 Subject: [PATCH] test: avoid testing exact JSON in CSV backend (#1038) * feat: updated verify_export Moved verify_export to verify_utils Reuse verify_export in tests Signed-off-by: Matheus Abdias * feat: replace verify_export with verify_document in CSV conversion tests Signed-off-by: Matheus Abdias --------- Signed-off-by: Matheus Abdias --- tests/test_backend_csv.py | 21 +++------------------ tests/test_backend_html.py | 18 +----------------- tests/test_backend_jats.py | 14 +------------- tests/test_backend_msexcel.py | 18 +----------------- tests/test_backend_msword.py | 17 +---------------- tests/test_backend_pptx.py | 18 +----------------- tests/verify_utils.py | 14 ++++++++++++++ 7 files changed, 22 insertions(+), 98 deletions(-) diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index 8bdbc8d..252f7c6 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -8,6 +8,8 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter +from .verify_utils import verify_document, verify_export + GENERATE = False @@ -33,22 +35,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - - return True - - else: - with open(gtfile, "r") as fr: - true_text = fr.read() - - assert pred_text == true_text, "pred_itxt==true_itxt" - return pred_text == true_text - - def test_e2e_valid_csv_conversions(): valid_csv_paths = get_csv_paths() converter = get_converter() @@ -72,8 +58,7 @@ def test_e2e_valid_csv_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - pred_json: str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + assert verify_document(doc, str(gt_path) + ".json"), "export to json" def test_e2e_invalid_csv_conversions(): diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 02fb0c3..6c1db06 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -11,7 +11,7 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter -from .verify_utils import verify_document +from .verify_utils import verify_document, verify_export GENERATE = False @@ -58,22 +58,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - - return True - - else: - with open(gtfile) as fr: - true_text = fr.read() - - assert pred_text == true_text, f"pred_text!=true_text for {gtfile}" - return pred_text == true_text - - def test_e2e_html_conversions(): html_paths = get_html_paths() diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py index 377cf3b..a338e32 100644 --- a/tests/test_backend_jats.py +++ b/tests/test_backend_jats.py @@ -8,7 +8,7 @@ from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter -from .verify_utils import verify_document +from .verify_utils import verify_document, verify_export GENERATE = False @@ -24,18 +24,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - return True - else: - with open(gtfile, "r") as fr: - true_text = fr.read() - assert pred_text == true_text, f"pred_text!=true_text for {gtfile}" - return pred_text == true_text - - def test_e2e_pubmed_conversions(use_stream=False): pubmed_paths = get_pubmed_paths() converter = get_converter() diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 5324185..549088e 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -5,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_document +from .verify_utils import verify_document, verify_export GENERATE = False @@ -27,22 +27,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - - return True - - else: - with open(gtfile, "r") as fr: - true_text = fr.read() - - assert pred_text == true_text, "pred_itxt==true_itxt" - return pred_text == true_text - - def test_e2e_xlsx_conversions(): xlsx_paths = get_xlsx_paths() diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9adf54f..83251a5 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -11,7 +11,7 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter -from .verify_utils import verify_document +from .verify_utils import verify_document, verify_export GENERATE = False @@ -58,21 +58,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - - return True - - else: - with open(gtfile, "r") as fr: - true_text = fr.read() - - return pred_text == true_text - - def test_e2e_docx_conversions(): docx_paths = get_docx_paths() diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index c0e71df..5f6129f 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -5,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_document +from .verify_utils import verify_document, verify_export GENERATE = False @@ -27,22 +27,6 @@ def get_converter(): return converter -def verify_export(pred_text: str, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - fw.write(pred_text) - - return True - - else: - with open(gtfile, "r") as fr: - true_text = fr.read() - - assert pred_text == true_text, "pred_itxt==true_itxt" - return pred_text == true_text - - def test_e2e_pptx_conversions(): pptx_paths = get_pptx_paths() diff --git a/tests/verify_utils.py b/tests/verify_utils.py index d94ccfb..45152e0 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -472,3 +472,17 @@ def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = Fal true_doc = DoclingDocument.model_validate_json(fr.read()) return verify_docitems(pred_doc, true_doc, fuzzy=False) + + +def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool: + file = Path(gtfile) + + if not file.exists() or generate: + with file.open("w") as fw: + fw.write(pred_text) + return True + + with file.open("r") as fr: + true_text = fr.read() + + return pred_text == true_text