test: avoid testing exact JSON (#1027)
* test: avoid testing exact JSON

  Avoid testing exact JSON output in html and xml backends.
  Reuse the JSON verify helper function among backend test files.
  Improve type annotations in html backend.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Update tests/test_backend_patent_uspto.py

  Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
parent
6796f0a132
commit
1ac010354f
@ -5,9 +5,11 @@ from typing import Optional, Union, cast
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
@ -32,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Initialise the parents for the hierarchy
|
||||
self.max_levels = 10
|
||||
self.level = 0
|
||||
self.parents = {} # type: ignore
|
||||
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
||||
for i in range(0, self.max_levels):
|
||||
self.parents[i] = None
|
||||
self.labels = {} # type: ignore
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
@ -111,11 +112,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
if tag.name in self.labels:
|
||||
self.labels[tag.name] += 1
|
||||
else:
|
||||
self.labels[tag.name] = 1
|
||||
|
||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
self.handle_header(tag, doc)
|
||||
elif tag.name in ["p"]:
|
||||
@ -238,8 +234,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
"""Handles listitem tags (li)."""
|
||||
nested_list = element.find(["ul", "ol"])
|
||||
|
||||
parent_list_label = self.parents[self.level].label
|
||||
index_in_list = len(self.parents[self.level].children) + 1
|
||||
parent = self.parents[self.level]
|
||||
if parent is None:
|
||||
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
|
||||
return
|
||||
parent_label: str = parent.label
|
||||
index_in_list = len(parent.children) + 1
|
||||
|
||||
if nested_list:
|
||||
# Text in list item can be hidden within hierarchy, hence
|
||||
@ -251,7 +251,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = str(index_in_list)
|
||||
enumerated = True
|
||||
|
||||
@ -261,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
parent=parent,
|
||||
)
|
||||
self.level += 1
|
||||
|
||||
@ -275,14 +275,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_list_label == GroupLabel.ORDERED_LIST:
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = f"{str(index_in_list)}."
|
||||
enumerated = True
|
||||
doc.add_list_item(
|
||||
text=text,
|
||||
enumerated=enumerated,
|
||||
marker=marker,
|
||||
parent=self.parents[self.level],
|
||||
parent=parent,
|
||||
)
|
||||
else:
|
||||
_log.warning(f"list-item has no text: {element}")
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -12,6 +11,8 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
||||
@ -66,7 +67,7 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
return True
|
||||
|
||||
else:
|
||||
with open(gtfile, "r") as fr:
|
||||
with open(gtfile) as fr:
|
||||
true_text = fr.read()
|
||||
|
||||
assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
|
||||
@ -99,5 +100,4 @@ def test_e2e_html_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
||||
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
@ -9,6 +8,8 @@ from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
||||
@ -61,8 +62,7 @@ def test_e2e_pubmed_conversions(use_stream=False):
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
|
||||
assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
|
||||
assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
|
||||
|
||||
|
||||
def test_e2e_pubmed_conversions_stream():
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_docitems
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
return pred_text == true_text
|
||||
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str):
|
||||
|
||||
if not os.path.exists(gtfile) or GENERATE:
|
||||
with open(gtfile, "w") as fw:
|
||||
json.dump(pred_doc.export_to_dict(), fw, indent=2)
|
||||
|
||||
return True
|
||||
else:
|
||||
with open(gtfile, "r") as fr:
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
||||
|
||||
def test_e2e_xlsx_conversions():
|
||||
|
||||
xlsx_paths = get_xlsx_paths()
|
||||
@ -84,4 +69,6 @@ def test_e2e_xlsx_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json"), "document document"
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
), "document document"
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -12,7 +11,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_docitems
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
return pred_text == true_text
|
||||
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str):
|
||||
|
||||
if not os.path.exists(gtfile) or GENERATE:
|
||||
with open(gtfile, "w") as fw:
|
||||
json.dump(pred_doc.export_to_dict(), fw, indent=2)
|
||||
|
||||
return True
|
||||
else:
|
||||
with open(gtfile, "r") as fr:
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
||||
|
||||
def test_e2e_docx_conversions():
|
||||
|
||||
docx_paths = get_docx_paths()
|
||||
@ -114,7 +99,9 @@ def test_e2e_docx_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json"), "document document"
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
), "document document"
|
||||
|
||||
if docx_path.name == "word_tables.docx":
|
||||
pred_html: str = doc.export_to_html()
|
||||
|
@ -1,6 +1,5 @@
|
||||
"""Test methods in module docling.backend.patent_uspto_backend.py."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
@ -14,6 +13,8 @@ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTab
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE: bool = False
|
||||
DATA_PATH: Path = Path("./tests/data/uspto/")
|
||||
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
|
||||
@ -110,12 +111,11 @@ def test_patent_groundtruth(patents, groundtruth):
|
||||
assert (
|
||||
pred_md == gt_names[md_name]
|
||||
), f"Markdown file mismatch against groundtruth {md_name}"
|
||||
json_name = path.stem + ".json"
|
||||
if json_name in gt_names:
|
||||
pred_json = json.dumps(doc.export_to_dict(), indent=2)
|
||||
assert (
|
||||
pred_json == gt_names[json_name]
|
||||
), f"JSON file mismatch against groundtruth {json_name}"
|
||||
json_path = path.with_suffix(".json")
|
||||
if json_path.stem in gt_names:
|
||||
assert verify_document(
|
||||
doc, str(json_path), GENERATE
|
||||
), f"JSON file mismatch against groundtruth {json_path}"
|
||||
itxt_name = path.stem + ".itxt"
|
||||
if itxt_name in gt_names:
|
||||
pred_itxt = doc._export_to_indented_text()
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_docitems
|
||||
from .verify_utils import verify_document
|
||||
|
||||
GENERATE = False
|
||||
|
||||
@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
|
||||
return pred_text == true_text
|
||||
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str):
|
||||
|
||||
if not os.path.exists(gtfile) or GENERATE:
|
||||
with open(gtfile, "w") as fw:
|
||||
json.dump(pred_doc.export_to_dict(), fw, indent=2)
|
||||
|
||||
return True
|
||||
else:
|
||||
with open(gtfile, "r") as fr:
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
||||
|
||||
def test_e2e_pptx_conversions():
|
||||
|
||||
pptx_paths = get_pptx_paths()
|
||||
@ -84,4 +69,6 @@ def test_e2e_pptx_conversions():
|
||||
pred_itxt, str(gt_path) + ".itxt"
|
||||
), "export to indented-text"
|
||||
|
||||
assert verify_document(doc, str(gt_path) + ".json"), "document document"
|
||||
assert verify_document(
|
||||
doc, str(gt_path) + ".json", GENERATE
|
||||
), "document document"
|
||||
|
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
@ -457,3 +458,17 @@ def verify_conversion_result_v2(
|
||||
assert verify_dt(
|
||||
doc_pred_dt, doc_true_dt, fuzzy=fuzzy
|
||||
), f"Mismatch in DocTags prediction for {input_path}"
|
||||
|
||||
|
||||
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
|
||||
|
||||
if not os.path.exists(gtfile) or generate:
|
||||
with open(gtfile, "w") as fw:
|
||||
json.dump(pred_doc.export_to_dict(), fw, indent=2)
|
||||
|
||||
return True
|
||||
else:
|
||||
with open(gtfile) as fr:
|
||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||
|
||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||
|
Loading…
Reference in New Issue
Block a user