test: avoid testing exact JSON (#1027)

* test: avoid testing exact JSON

Avoid testing exact JSON output in the HTML and XML backends.
Reuse the JSON verify helper function among backend test files.
Improve type annotations in the HTML backend.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Update tests/test_backend_patent_uspto.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-02-20 16:20:07 +01:00 committed by GitHub
parent 6796f0a132
commit 1ac010354f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 54 additions and 78 deletions

View File

@ -5,9 +5,11 @@ from typing import Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
TableCell,
TableData,
@ -32,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level = 0
self.parents = {} # type: ignore
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels):
self.parents[i] = None
self.labels = {} # type: ignore
try:
if isinstance(self.path_or_stream, BytesIO):
@ -111,11 +112,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
if tag.name in self.labels:
self.labels[tag.name] += 1
else:
self.labels[tag.name] = 1
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
elif tag.name in ["p"]:
@ -238,8 +234,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
"""Handles listitem tags (li)."""
nested_list = element.find(["ul", "ol"])
parent_list_label = self.parents[self.level].label
index_in_list = len(self.parents[self.level].children) + 1
parent = self.parents[self.level]
if parent is None:
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if nested_list:
# Text in list item can be hidden within hierarchy, hence
@ -251,7 +251,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
if parent_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
@ -261,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
parent=parent,
)
self.level += 1
@ -275,14 +275,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
parent=parent,
)
else:
_log.warning(f"list-item has no text: {element}")

View File

@ -1,4 +1,3 @@
import json
import os
from pathlib import Path
@ -12,6 +11,8 @@ from docling.datamodel.document import (
)
from docling.document_converter import DocumentConverter
from .verify_utils import verify_document
GENERATE = False
@ -66,7 +67,7 @@ def verify_export(pred_text: str, gtfile: str):
return True
else:
with open(gtfile, "r") as fr:
with open(gtfile) as fr:
true_text = fr.read()
assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
@ -99,5 +100,4 @@ def test_e2e_html_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
assert verify_document(doc, str(gt_path) + ".json", GENERATE)

View File

@ -1,4 +1,3 @@
import json
import os
from io import BytesIO
from pathlib import Path
@ -9,6 +8,8 @@ from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from .verify_utils import verify_document
GENERATE = False
@ -61,8 +62,7 @@ def test_e2e_pubmed_conversions(use_stream=False):
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
def test_e2e_pubmed_conversions_stream():

View File

@ -1,4 +1,3 @@
import json
import os
from pathlib import Path
@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
from .verify_utils import verify_docitems
from .verify_utils import verify_document
GENERATE = False
@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
def verify_document(pred_doc: DoclingDocument, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
json.dump(pred_doc.export_to_dict(), fw, indent=2)
return True
else:
with open(gtfile, "r") as fr:
true_doc = DoclingDocument.model_validate_json(fr.read())
return verify_docitems(pred_doc, true_doc, fuzzy=False)
def test_e2e_xlsx_conversions():
xlsx_paths = get_xlsx_paths()
@ -84,4 +69,6 @@ def test_e2e_xlsx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_document(doc, str(gt_path) + ".json"), "document document"
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"

View File

@ -1,4 +1,3 @@
import json
import os
from pathlib import Path
@ -12,7 +11,7 @@ from docling.datamodel.document import (
)
from docling.document_converter import DocumentConverter
from .verify_utils import verify_docitems
from .verify_utils import verify_document
GENERATE = False
@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
def verify_document(pred_doc: DoclingDocument, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
json.dump(pred_doc.export_to_dict(), fw, indent=2)
return True
else:
with open(gtfile, "r") as fr:
true_doc = DoclingDocument.model_validate_json(fr.read())
return verify_docitems(pred_doc, true_doc, fuzzy=False)
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
@ -114,7 +99,9 @@ def test_e2e_docx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_document(doc, str(gt_path) + ".json"), "document document"
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()

View File

@ -1,6 +1,5 @@
"""Test methods in module docling.backend.patent_uspto_backend.py."""
import json
import logging
import os
from pathlib import Path
@ -14,6 +13,8 @@ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTab
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from .verify_utils import verify_document
GENERATE: bool = False
DATA_PATH: Path = Path("./tests/data/uspto/")
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
@ -110,12 +111,11 @@ def test_patent_groundtruth(patents, groundtruth):
assert (
pred_md == gt_names[md_name]
), f"Markdown file mismatch against groundtruth {md_name}"
json_name = path.stem + ".json"
if json_name in gt_names:
pred_json = json.dumps(doc.export_to_dict(), indent=2)
assert (
pred_json == gt_names[json_name]
), f"JSON file mismatch against groundtruth {json_name}"
json_path = path.with_suffix(".json")
if json_path.stem in gt_names:
assert verify_document(
doc, str(json_path), GENERATE
), f"JSON file mismatch against groundtruth {json_path}"
itxt_name = path.stem + ".itxt"
if itxt_name in gt_names:
pred_itxt = doc._export_to_indented_text()

View File

@ -1,4 +1,3 @@
import json
import os
from pathlib import Path
@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
from .verify_utils import verify_docitems
from .verify_utils import verify_document
GENERATE = False
@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
def verify_document(pred_doc: DoclingDocument, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
json.dump(pred_doc.export_to_dict(), fw, indent=2)
return True
else:
with open(gtfile, "r") as fr:
true_doc = DoclingDocument.model_validate_json(fr.read())
return verify_docitems(pred_doc, true_doc, fuzzy=False)
def test_e2e_pptx_conversions():
pptx_paths = get_pptx_paths()
@ -84,4 +69,6 @@ def test_e2e_pptx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_document(doc, str(gt_path) + ".json"), "document document"
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"

View File

@ -1,4 +1,5 @@
import json
import os
import warnings
from pathlib import Path
from typing import List, Optional
@ -457,3 +458,17 @@ def verify_conversion_result_v2(
assert verify_dt(
doc_pred_dt, doc_true_dt, fuzzy=fuzzy
), f"Mismatch in DocTags prediction for {input_path}"
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
    """Check a converted document against its JSON ground-truth file.

    The comparison is made on parsed documents (via ``verify_docitems`` with
    ``fuzzy=False``) rather than on the exact serialized JSON text.

    Args:
        pred_doc: The document produced by the conversion under test.
        gtfile: Path to the JSON ground-truth file.
        generate: When true, (re)write ``gtfile`` from ``pred_doc`` instead of
            comparing against it.

    Returns:
        ``True`` when the ground truth was written (because ``gtfile`` was
        missing or ``generate`` was set); otherwise the result of
        ``verify_docitems(pred_doc, true_doc, fuzzy=False)``.
    """
    if generate or not os.path.exists(gtfile):
        # Pin the encoding so generated ground truth is identical on every
        # platform instead of depending on the locale's default codec.
        with open(gtfile, "w", encoding="utf-8") as fw:
            json.dump(pred_doc.export_to_dict(), fw, indent=2)
        return True

    # Ground-truth JSON may contain non-ASCII text; reading it with the
    # platform default encoding can fail or mis-decode on non-UTF-8 locales.
    with open(gtfile, encoding="utf-8") as fr:
        true_doc = DoclingDocument.model_validate_json(fr.read())
    return verify_docitems(pred_doc, true_doc, fuzzy=False)