def verify_document(pred_doc: DoclingDocument, gtfile: str):
    """Compare a converted document against its JSON ground-truth file.

    When the ground-truth file is missing, or the module-level ``GENERATE``
    flag is set, the prediction is written to ``gtfile`` and the check passes.
    Otherwise the stored document is loaded and compared item by item through
    ``verify_docitems`` with exact (non-fuzzy) text matching.

    Args:
        pred_doc: Document produced by the converter under test.
        gtfile: Path of the JSON ground-truth file.

    Returns:
        ``True`` when the ground truth was (re)generated; otherwise the result
        of ``verify_docitems``.
    """
    if GENERATE or not os.path.exists(gtfile):
        # Explicit encoding keeps the fixture independent of the platform's
        # default locale encoding.
        with open(gtfile, "w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, indent=2)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_doc = DoclingDocument.model_validate_json(fr.read())

    return verify_docitems(pred_doc, true_doc, fuzzy=False)
def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
    """Assert that a predicted table matches the expected table cell by cell.

    Args:
        true_item: Expected (ground-truth) table.
        pred_item: Table produced by the conversion under test.
        fuzzy: Forwarded to ``verify_text`` for the per-cell text comparison.

    Returns:
        ``True`` when every check passes.

    Raises:
        AssertionError: If either table has no data, the row/column counts
            differ, or a cell's column_header / row_header / row_section flag
            differs between the two tables.
    """
    true_data = true_item.data
    pred_data = pred_item.data

    # Guard against missing data *before* dereferencing it; checked afterwards,
    # a missing table would fail with an AttributeError instead of this message.
    assert true_data is not None, "documents are expected to have table data"
    assert pred_data is not None, "documents are expected to have table data"

    assert (
        true_data.num_rows == pred_data.num_rows
    ), "table does not have the same #-rows"
    assert (
        true_data.num_cols == pred_data.num_cols
    ), "table does not have the same #-cols"

    # Fetch each grid once instead of on every cell access.
    true_grid = true_data.grid
    pred_grid = pred_data.grid

    for i, true_row in enumerate(true_grid):
        for j, true_cell in enumerate(true_row):
            pred_cell = pred_grid[i][j]

            verify_text(true_cell.text, pred_cell.text, fuzzy=fuzzy)

            assert (
                true_cell.column_header == pred_cell.column_header
            ), "table-cell should be a column_header but prediction isn't"

            assert (
                true_cell.row_header == pred_cell.row_header
            ), "table-cell should be a row_header but prediction isn't"

            assert (
                true_cell.row_section == pred_cell.row_section
            ), "table-cell should be a row_section but prediction isn't"

    return True
def verify_picture_image_v2(
    true_image: PILImage.Image, pred_item: Optional[PILImage.Image]
):
    """Assert that a predicted picture has the expected size and mode.

    Pixel contents are not compared; only ``size`` and ``mode`` are checked.

    Args:
        true_image: Expected image.
        pred_item: Predicted image, or ``None`` when no image was produced.

    Returns:
        ``True`` when every check passes.

    Raises:
        AssertionError: If the predicted image is ``None`` or its size or mode
            differs from the expected image.
    """
    assert pred_item is not None, "predicted image is None"
    assert true_image.size == pred_item.size
    assert true_image.mode == pred_item.mode
    return True


def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
    """Assert that the items of a predicted document match the expected ones.

    Items of both documents are walked in lockstep. For every expected
    ``DocItem`` the label, the number of provenance entries and the page of
    the first provenance entry are compared, followed by a type-specific
    check of text, table or picture content.

    Args:
        doc_pred: Document produced by the conversion under test.
        doc_true: Expected (ground-truth) document.
        fuzzy: Forwarded to the text and table comparisons.

    Returns:
        ``True`` when every check passes.

    Raises:
        AssertionError: On the first mismatch found.
    """
    assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."

    assert len(doc_true.tables) == len(
        doc_pred.tables
    ), "document has different count of tables than expected."

    # NOTE(review): zip() stops at the shorter sequence, so trailing items that
    # exist in only one of the documents are not compared here.
    for (true_item, _true_level), (pred_item, _pred_level) in zip(
        doc_true.iterate_items(), doc_pred.iterate_items()
    ):
        if not isinstance(true_item, DocItem):
            continue
        assert isinstance(pred_item, DocItem), "Test item is not a DocItem"

        # Validate type
        assert true_item.label == pred_item.label, "Object label does not match."

        # Validate provenance
        assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
        if len(true_item.prov) > 0:
            true_prov = true_item.prov[0]
            pred_prov = pred_item.prov[0]

            assert true_prov.page_no == pred_prov.page_no, "Page provenance mistmatch"

            # TODO: add bbox check with tolerance

        # Validate text content
        if isinstance(true_item, TextItem):
            assert isinstance(pred_item, TextItem), (
                "Test item is not a TextItem as the expected one "
                f"{true_item=} "
                f"{pred_item=} "
            )

            assert verify_text(true_item.text, pred_item.text, fuzzy=fuzzy)

        # Validate table content
        if isinstance(true_item, TableItem):
            assert isinstance(
                pred_item, TableItem
            ), "Test item is not a TableItem as the expected one"
            assert verify_table_v2(
                true_item, pred_item, fuzzy=fuzzy
            ), "Tables not matching"

        # Validate picture content
        if isinstance(true_item, PictureItem):
            assert isinstance(
                pred_item, PictureItem
            ), "Test item is not a PictureItem as the expected one"

            true_image = true_item.get_image(doc=doc_true)
            # The predicted image must come from the predicted item; asking the
            # expected item again would not exercise the prediction.
            pred_image = pred_item.get_image(doc=doc_pred)
            if true_image is not None:
                assert verify_picture_image_v2(
                    true_image, pred_image
                ), "Picture image mismatch"

            # TODO: check picture annotations

    return True