From 1ac010354fbd27c500f6e0ad2dd80c8735cbd25f Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 20 Feb 2025 16:20:07 +0100 Subject: [PATCH] test: avoid testing exact JSON (#1027) * test: avoid testing exact JSON Avoid testing exact JSON output in html and xml backends. Reuse the JSON verify helper function among backend test files. Improve type annotations in html backend. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Update tests/test_backend_patent_uspto.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> --- docling/backend/html_backend.py | 26 +++++++++++++------------- tests/test_backend_html.py | 8 ++++---- tests/test_backend_jats.py | 6 +++--- tests/test_backend_msexcel.py | 21 ++++----------------- tests/test_backend_msword.py | 21 ++++----------------- tests/test_backend_patent_uspto.py | 14 +++++++------- tests/test_backend_pptx.py | 21 ++++----------------- tests/verify_utils.py | 15 +++++++++++++++ 8 files changed, 54 insertions(+), 78 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 234e5da..5b7f5d8 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -5,9 +5,11 @@ from typing import Optional, Union, cast from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, TableCell, TableData, @@ -32,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} # type: ignore + self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -111,11 +112,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: - if tag.name in self.labels: - self.labels[tag.name] += 1 - else: - self.labels[tag.name] = 1 - if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: self.handle_header(tag, doc) elif tag.name in ["p"]: @@ -238,8 +234,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): """Handles listitem tags (li).""" nested_list = element.find(["ul", "ol"]) - parent_list_label = self.parents[self.level].label - index_in_list = len(self.parents[self.level].children) + 1 + parent = self.parents[self.level] + if parent is None: + _log.warning(f"list-item has no parent in DoclingDocument: {element}") + return + parent_label: str = parent.label + index_in_list = len(parent.children) + 1 if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -251,7 +251,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = str(index_in_list) enumerated = True @@ -261,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) self.level += 1 @@ -275,14 +275,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True doc.add_list_item( text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) else: _log.warning(f"list-item has no text: {element}") diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index a4deb21..02fb0c3 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -12,6 +11,8 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter +from .verify_utils import verify_document + GENERATE = False @@ -66,7 +67,7 @@ def verify_export(pred_text: str, gtfile: str): return True else: - with open(gtfile, "r") as fr: + with open(gtfile) as fr: true_text = fr.read() assert pred_text == true_text, f"pred_text!=true_text for {gtfile}" @@ -99,5 +100,4 @@ def test_e2e_html_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - pred_json: str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + assert verify_document(doc, str(gt_path) + ".json", GENERATE) diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py index a15ba86..377cf3b 100644 --- a/tests/test_backend_jats.py +++ b/tests/test_backend_jats.py @@ -1,4 +1,3 @@ -import json import os from io import BytesIO from pathlib import Path @@ -9,6 +8,8 @@ from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter +from .verify_utils import verify_document + GENERATE = False @@ -61,8 +62,7 @@ def test_e2e_pubmed_conversions(use_stream=False): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - pred_json: str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json" def test_e2e_pubmed_conversions_stream(): diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index d07bb69..5324185 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_xlsx_conversions(): xlsx_paths = get_xlsx_paths() @@ -84,4 +69,6 @@ def test_e2e_xlsx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 4b092e0..9adf54f 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -12,7 +11,7 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_docx_conversions(): docx_paths = get_docx_paths() @@ -114,7 +99,9 @@ def test_e2e_docx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" if docx_path.name == "word_tables.docx": pred_html: str = doc.export_to_html() diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py index 0e95a4d..002aa71 100644 --- a/tests/test_backend_patent_uspto.py +++ b/tests/test_backend_patent_uspto.py @@ -1,6 +1,5 @@ """Test methods in module docling.backend.patent_uspto_backend.py.""" -import json import logging import os from pathlib import Path @@ -14,6 +13,8 @@ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTab from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument +from .verify_utils import verify_document + GENERATE: bool = False DATA_PATH: Path = Path("./tests/data/uspto/") GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/") @@ -110,12 +111,11 @@ def test_patent_groundtruth(patents, groundtruth): assert ( pred_md == gt_names[md_name] ), f"Markdown file mismatch against groundtruth {md_name}" - json_name = path.stem + ".json" - if json_name in gt_names: - pred_json = json.dumps(doc.export_to_dict(), indent=2) - assert ( - pred_json == gt_names[json_name] - ), f"JSON file mismatch against groundtruth {json_name}" + json_path = path.with_suffix(".json") + if json_path.stem in gt_names: + assert verify_document( + doc, str(json_path), GENERATE + ), f"JSON file mismatch against groundtruth {json_path}" itxt_name = path.stem + ".itxt" if itxt_name in gt_names: pred_itxt = doc._export_to_indented_text() diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 7540208..c0e71df 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_pptx_conversions(): pptx_paths = get_pptx_paths() @@ -84,4 +69,6 @@ def test_e2e_pptx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 0493dac..d94ccfb 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -1,4 +1,5 @@ import json +import os import warnings from pathlib import Path from typing import List, Optional @@ -457,3 +458,17 @@ def verify_conversion_result_v2( assert verify_dt( doc_pred_dt, doc_true_dt, fuzzy=fuzzy ), f"Mismatch in DocTags prediction for {input_path}" + + +def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False): + + if not os.path.exists(gtfile) or generate: + with open(gtfile, "w") as fw: + json.dump(pred_doc.export_to_dict(), fw, indent=2) + + return True + else: + with open(gtfile) as fr: + true_doc = DoclingDocument.model_validate_json(fr.read()) + + return verify_docitems(pred_doc, true_doc, fuzzy=False)