diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 234e5da..5b7f5d8 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -5,9 +5,11 @@ from typing import Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from docling_core.types.doc import (
+ DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
+ GroupItem,
GroupLabel,
TableCell,
TableData,
@@ -32,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level = 0
- self.parents = {} # type: ignore
+ self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels):
self.parents[i] = None
- self.labels = {} # type: ignore
try:
if isinstance(self.path_or_stream, BytesIO):
@@ -111,11 +112,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
- if tag.name in self.labels:
- self.labels[tag.name] += 1
- else:
- self.labels[tag.name] = 1
-
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(tag, doc)
elif tag.name in ["p"]:
@@ -238,8 +234,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
"""Handles listitem tags (li)."""
nested_list = element.find(["ul", "ol"])
- parent_list_label = self.parents[self.level].label
- index_in_list = len(self.parents[self.level].children) + 1
+ parent = self.parents[self.level]
+ if parent is None:
+ _log.warning(f"list-item has no parent in DoclingDocument: {element}")
+ return
+ parent_label: str = parent.label
+ index_in_list = len(parent.children) + 1
if nested_list:
# Text in list item can be hidden within hierarchy, hence
@@ -251,7 +251,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
- if parent_list_label == GroupLabel.ORDERED_LIST:
+ if parent_label == GroupLabel.ORDERED_LIST:
marker = str(index_in_list)
enumerated = True
@@ -261,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text=text,
enumerated=enumerated,
marker=marker,
- parent=self.parents[self.level],
+ parent=parent,
)
self.level += 1
@@ -275,14 +275,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
- if parent_list_label == GroupLabel.ORDERED_LIST:
+ if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
- parent=self.parents[self.level],
+ parent=parent,
)
else:
_log.warning(f"list-item has no text: {element}")
diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py
index a4deb21..02fb0c3 100644
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -1,4 +1,3 @@
-import json
import os
from pathlib import Path
@@ -12,6 +11,8 @@ from docling.datamodel.document import (
)
from docling.document_converter import DocumentConverter
+from .verify_utils import verify_document
+
GENERATE = False
@@ -66,7 +67,7 @@ def verify_export(pred_text: str, gtfile: str):
return True
else:
- with open(gtfile, "r") as fr:
+ with open(gtfile) as fr:
true_text = fr.read()
assert pred_text == true_text, f"pred_text!=true_text for {gtfile}"
@@ -99,5 +100,4 @@ def test_e2e_html_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
- pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
- assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
+ assert verify_document(doc, str(gt_path) + ".json", GENERATE)
diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py
index a15ba86..377cf3b 100644
--- a/tests/test_backend_jats.py
+++ b/tests/test_backend_jats.py
@@ -1,4 +1,3 @@
-import json
import os
from io import BytesIO
from pathlib import Path
@@ -9,6 +8,8 @@ from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
+from .verify_utils import verify_document
+
GENERATE = False
@@ -61,8 +62,7 @@ def test_e2e_pubmed_conversions(use_stream=False):
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
- pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
- assert verify_export(pred_json, str(gt_path) + ".json"), "export to json"
+ assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
def test_e2e_pubmed_conversions_stream():
diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py
index d07bb69..5324185 100644
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -1,4 +1,3 @@
-import json
import os
from pathlib import Path
@@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
-from .verify_utils import verify_docitems
+from .verify_utils import verify_document
GENERATE = False
@@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
-def verify_document(pred_doc: DoclingDocument, gtfile: str):
-
- if not os.path.exists(gtfile) or GENERATE:
- with open(gtfile, "w") as fw:
- json.dump(pred_doc.export_to_dict(), fw, indent=2)
-
- return True
- else:
- with open(gtfile, "r") as fr:
- true_doc = DoclingDocument.model_validate_json(fr.read())
-
- return verify_docitems(pred_doc, true_doc, fuzzy=False)
-
-
def test_e2e_xlsx_conversions():
xlsx_paths = get_xlsx_paths()
@@ -84,4 +69,6 @@ def test_e2e_xlsx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
- assert verify_document(doc, str(gt_path) + ".json"), "document document"
+ assert verify_document(
+ doc, str(gt_path) + ".json", GENERATE
+ ), "document document"
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 4b092e0..9adf54f 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,4 +1,3 @@
-import json
import os
from pathlib import Path
@@ -12,7 +11,7 @@ from docling.datamodel.document import (
)
from docling.document_converter import DocumentConverter
-from .verify_utils import verify_docitems
+from .verify_utils import verify_document
GENERATE = False
@@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
-def verify_document(pred_doc: DoclingDocument, gtfile: str):
-
- if not os.path.exists(gtfile) or GENERATE:
- with open(gtfile, "w") as fw:
- json.dump(pred_doc.export_to_dict(), fw, indent=2)
-
- return True
- else:
- with open(gtfile, "r") as fr:
- true_doc = DoclingDocument.model_validate_json(fr.read())
-
- return verify_docitems(pred_doc, true_doc, fuzzy=False)
-
-
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
@@ -114,7 +99,9 @@ def test_e2e_docx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
- assert verify_document(doc, str(gt_path) + ".json"), "document document"
+ assert verify_document(
+ doc, str(gt_path) + ".json", GENERATE
+ ), "document document"
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()
diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py
index 0e95a4d..002aa71 100644
--- a/tests/test_backend_patent_uspto.py
+++ b/tests/test_backend_patent_uspto.py
@@ -1,6 +1,5 @@
"""Test methods in module docling.backend.patent_uspto_backend.py."""
-import json
import logging
import os
from pathlib import Path
@@ -14,6 +13,8 @@ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTab
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
+from .verify_utils import verify_document
+
GENERATE: bool = False
DATA_PATH: Path = Path("./tests/data/uspto/")
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
@@ -110,12 +111,11 @@ def test_patent_groundtruth(patents, groundtruth):
assert (
pred_md == gt_names[md_name]
), f"Markdown file mismatch against groundtruth {md_name}"
- json_name = path.stem + ".json"
- if json_name in gt_names:
- pred_json = json.dumps(doc.export_to_dict(), indent=2)
- assert (
- pred_json == gt_names[json_name]
- ), f"JSON file mismatch against groundtruth {json_name}"
+        json_name = path.stem + ".json"
+        if json_name in gt_names:  # gt_names is keyed by file name, like md/itxt
+            assert verify_document(
+                doc, str(GT_PATH / json_name), GENERATE
+            ), f"JSON file mismatch against groundtruth {json_name}"
itxt_name = path.stem + ".itxt"
if itxt_name in gt_names:
pred_itxt = doc._export_to_indented_text()
diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py
index 7540208..c0e71df 100644
--- a/tests/test_backend_pptx.py
+++ b/tests/test_backend_pptx.py
@@ -1,4 +1,3 @@
-import json
import os
from pathlib import Path
@@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
-from .verify_utils import verify_docitems
+from .verify_utils import verify_document
GENERATE = False
@@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str):
return pred_text == true_text
-def verify_document(pred_doc: DoclingDocument, gtfile: str):
-
- if not os.path.exists(gtfile) or GENERATE:
- with open(gtfile, "w") as fw:
- json.dump(pred_doc.export_to_dict(), fw, indent=2)
-
- return True
- else:
- with open(gtfile, "r") as fr:
- true_doc = DoclingDocument.model_validate_json(fr.read())
-
- return verify_docitems(pred_doc, true_doc, fuzzy=False)
-
-
def test_e2e_pptx_conversions():
pptx_paths = get_pptx_paths()
@@ -84,4 +69,6 @@ def test_e2e_pptx_conversions():
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
- assert verify_document(doc, str(gt_path) + ".json"), "document document"
+ assert verify_document(
+ doc, str(gt_path) + ".json", GENERATE
+ ), "document document"
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 0493dac..d94ccfb 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -1,4 +1,5 @@
import json
+import os
import warnings
from pathlib import Path
from typing import List, Optional
@@ -457,3 +458,17 @@ def verify_conversion_result_v2(
assert verify_dt(
doc_pred_dt, doc_true_dt, fuzzy=fuzzy
), f"Mismatch in DocTags prediction for {input_path}"
+
+
+def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
+    """Store pred_doc at gtfile when it is missing or generate is set, else compare."""
+    if not os.path.exists(gtfile) or generate:
+        # Nothing to compare against: the prediction becomes the ground truth.
+        with open(gtfile, "w", encoding="utf-8") as fw:
+            json.dump(pred_doc.export_to_dict(), fw, indent=2)
+        return True
+
+    # Explicit UTF-8 so parsing the stored JSON does not depend on the locale.
+    with open(gtfile, encoding="utf-8") as fr:
+        true_doc = DoclingDocument.model_validate_json(fr.read())
+    return verify_docitems(pred_doc, true_doc, fuzzy=False)