Docling/tests/test_backend_html.py
Cesar Berrospi Ramis 1ac010354f
test: avoid testing exact JSON (#1027)
* test: avoid testing exact JSON

Avoid testing exact JSON output in html and xml backends.
Reuse the JSON verify helper function among backend test files.
Improve type annotations in html backend.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Update tests/test_backend_patent_uspto.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
2025-02-20 16:20:07 +01:00

104 lines
2.7 KiB
Python

import os
from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from .verify_utils import verify_document
GENERATE = False
def test_heading_levels():
in_path = Path("tests/data/html/wiki_duck.html")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
found_lvl_2 = found_lvl_3 = False
for item, _ in doc.iterate_items():
if isinstance(item, SectionHeaderItem):
if item.text == "Etymology":
found_lvl_2 = True
assert item.level == 2
elif item.text == "Feeding":
found_lvl_3 = True
assert item.level == 3
assert found_lvl_2 and found_lvl_3
def get_html_paths():
# Define the directory you want to search
directory = Path("./tests/data/html/")
# List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html"))
return html_files
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
return converter
def verify_export(pred_text: str, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
fw.write(pred_text)
return True
else:
with open(gtfile) as fr:
true_text = fr.read()
assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
return pred_text == true_text
def test_e2e_html_conversions():
html_paths = get_html_paths()
converter = get_converter()
for html_path in html_paths:
# print(f"converting {html_path}")
gt_path = (
html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
)
conv_result: ConversionResult = converter.convert(html_path)
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_document(doc, str(gt_path) + ".json", GENERATE)