test: avoid testing exact JSON in CSV backend (#1038)
* feat: update verify_export
  - Move verify_export to verify_utils.
  - Reuse verify_export in the tests.
  Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com>
* feat: replace verify_export with verify_document in the CSV conversion tests
  Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com>
---------
Signed-off-by: Matheus Abdias <matheusfabdias@gmail.com>
This commit is contained in:
parent
d8a81c3168
commit
1d17e7397a
@ -8,6 +8,8 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
||||
@ -33,22 +35,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or matches
        ``pred_text``. False is only reachable when assertions are disabled
        and the texts differ.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, encoding="utf-8") as fr:
        true_text = fr.read()

    # The message is displayed on failure, so it has to describe the mismatch.
    assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_valid_csv_conversions():
|
||||
valid_csv_paths = get_csv_paths()
|
||||
converter = get_converter()
|
||||
@ -72,8 +58,7 @@ def test_e2e_valid_csv_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
||||
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
||||
assert verify_document(doc, str(gt_path) + ".json"), "export to json"
|
||||
|
||||
|
||||
def test_e2e_invalid_csv_conversions():
|
||||
|
@ -11,7 +11,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -58,22 +58,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or matches
        ``pred_text``. False is only reachable when assertions are disabled
        and the texts differ.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, encoding="utf-8") as fr:
        true_text = fr.read()

    assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_html_conversions():
|
||||
|
||||
html_paths = get_html_paths()
|
||||
|
@ -8,7 +8,7 @@ from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -24,18 +24,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or matches
        ``pred_text``. False is only reachable when assertions are disabled
        and the texts differ.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True
    else:
        with open(gtfile, "r", encoding="utf-8") as fr:
            true_text = fr.read()
        assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
        return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_pubmed_conversions(use_stream=False):
|
||||
pubmed_paths = get_pubmed_paths()
|
||||
converter = get_converter()
|
||||
|
@ -5,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -27,22 +27,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or matches
        ``pred_text``. False is only reachable when assertions are disabled
        and the texts differ.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    # The message is displayed on failure, so it has to describe the mismatch.
    assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_xlsx_conversions():
|
||||
|
||||
xlsx_paths = get_xlsx_paths()
|
||||
|
@ -11,7 +11,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -58,21 +58,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.
    This variant never raises on a mismatch; it reports it through the
    return value.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or equals ``pred_text``,
        False when the stored text differs.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_docx_conversions():
|
||||
|
||||
docx_paths = get_docx_paths()
|
||||
|
@ -5,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
from .verify_utils import verify_document, verify_export
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -27,22 +27,6 @@ def get_converter():
|
||||
return converter
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str):
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    truthy, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the stored text is read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.

    Returns:
        True when the ground truth was (re)generated or matches
        ``pred_text``. False is only reachable when assertions are disabled
        and the texts differ.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so ground-truth files do not depend on the locale.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    # The message is displayed on failure, so it has to describe the mismatch.
    assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_pptx_conversions():
|
||||
|
||||
pptx_paths = get_pptx_paths()
|
||||
|
@ -472,3 +472,17 @@ def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = Fal
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
    """Compare exported text against a ground-truth file.

    When ``gtfile`` does not exist, or ``generate`` is true, ``pred_text`` is
    written to ``gtfile`` and the check passes. Otherwise the stored text is
    read back and compared to ``pred_text``.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file.
        generate: Overwrite the ground truth with ``pred_text`` instead of
            comparing against it.

    Returns:
        True when the ground truth was (re)generated or equals ``pred_text``,
        False when the stored text differs.
    """
    gt_path = Path(gtfile)

    if generate or not gt_path.exists():
        # Pin the encoding so ground-truth files do not depend on the locale.
        gt_path.write_text(pred_text, encoding="utf-8")
        return True

    return pred_text == gt_path.read_text(encoding="utf-8")
|
||||
|
Loading…
Reference in New Issue
Block a user