diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 234e5da..5b7f5d8 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -5,9 +5,11 @@ from typing import Optional, Union, cast from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, + GroupItem, GroupLabel, TableCell, TableData, @@ -32,10 +34,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # Initialise the parents for the hierarchy self.max_levels = 10 self.level = 0 - self.parents = {} # type: ignore + self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} for i in range(0, self.max_levels): self.parents[i] = None - self.labels = {} # type: ignore try: if isinstance(self.path_or_stream, BytesIO): @@ -111,11 +112,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: - if tag.name in self.labels: - self.labels[tag.name] += 1 - else: - self.labels[tag.name] = 1 - if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: self.handle_header(tag, doc) elif tag.name in ["p"]: @@ -238,8 +234,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): """Handles listitem tags (li).""" nested_list = element.find(["ul", "ol"]) - parent_list_label = self.parents[self.level].label - index_in_list = len(self.parents[self.level].children) + 1 + parent = self.parents[self.level] + if parent is None: + _log.warning(f"list-item has no parent in DoclingDocument: {element}") + return + parent_label: str = parent.label + index_in_list = len(parent.children) + 1 if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -251,7 +251,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = str(index_in_list) enumerated = True @@ -261,7 +261,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) self.level += 1 @@ -275,14 +275,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = "" enumerated = False - if parent_list_label == GroupLabel.ORDERED_LIST: + if parent_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True doc.add_list_item( text=text, enumerated=enumerated, marker=marker, - parent=self.parents[self.level], + parent=parent, ) else: _log.warning(f"list-item has no text: {element}") diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index a4deb21..02fb0c3 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -12,6 +11,8 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter +from .verify_utils import verify_document + GENERATE = False @@ -66,7 +67,7 @@ def verify_export(pred_text: str, gtfile: str): return True else: - with open(gtfile, "r") as fr: + with open(gtfile) as fr: true_text = fr.read() assert pred_text == true_text, f"pred_text!=true_text for {gtfile}" @@ -99,5 +100,4 @@ def test_e2e_html_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - pred_json: str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + assert verify_document(doc, str(gt_path) + ".json", GENERATE) diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py index a15ba86..377cf3b 100644 --- a/tests/test_backend_jats.py +++ b/tests/test_backend_jats.py @@ -1,4 +1,3 @@ -import json import os from io import BytesIO from pathlib import Path @@ -9,6 +8,8 @@ from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import ConversionResult from docling.document_converter import DocumentConverter +from .verify_utils import verify_document + GENERATE = False @@ -61,8 +62,7 @@ def test_e2e_pubmed_conversions(use_stream=False): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - pred_json: str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json" def test_e2e_pubmed_conversions_stream(): diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index d07bb69..5324185 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_xlsx_conversions(): xlsx_paths = get_xlsx_paths() @@ -84,4 +69,6 @@ def test_e2e_xlsx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 4b092e0..9adf54f 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -12,7 +11,7 @@ from docling.datamodel.document import ( ) from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -74,20 +73,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_docx_conversions(): docx_paths = get_docx_paths() @@ -114,7 +99,9 @@ def test_e2e_docx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" if docx_path.name == "word_tables.docx": pred_html: str = doc.export_to_html() diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py index 0e95a4d..002aa71 100644 --- a/tests/test_backend_patent_uspto.py +++ b/tests/test_backend_patent_uspto.py @@ -1,6 +1,5 @@ """Test methods in module docling.backend.patent_uspto_backend.py.""" -import json import logging import os from pathlib import Path @@ -14,6 +13,8 @@ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTab from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument +from .verify_utils import verify_document + GENERATE: bool = False DATA_PATH: Path = Path("./tests/data/uspto/") GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/") @@ -110,12 +111,11 @@ def test_patent_groundtruth(patents, groundtruth): assert ( pred_md == gt_names[md_name] ), f"Markdown file mismatch against groundtruth {md_name}" - json_name = path.stem + ".json" - if json_name in gt_names: - pred_json = json.dumps(doc.export_to_dict(), indent=2) - assert ( - pred_json == gt_names[json_name] - ), f"JSON file mismatch against groundtruth {json_name}" + json_path = path.with_suffix(".json") + if json_path.stem in gt_names: + assert verify_document( + doc, str(json_path), GENERATE + ), f"JSON file mismatch against groundtruth {json_path}" itxt_name = path.stem + ".itxt" if itxt_name in gt_names: pred_itxt = doc._export_to_indented_text() diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 7540208..c0e71df 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -6,7 +5,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult, DoclingDocument from docling.document_converter import DocumentConverter -from .verify_utils import verify_docitems +from .verify_utils import verify_document GENERATE = False @@ -44,20 +43,6 @@ def verify_export(pred_text: str, gtfile: str): return pred_text == true_text -def verify_document(pred_doc: DoclingDocument, gtfile: str): - - if not os.path.exists(gtfile) or GENERATE: - with open(gtfile, "w") as fw: - json.dump(pred_doc.export_to_dict(), fw, indent=2) - - return True - else: - with open(gtfile, "r") as fr: - true_doc = DoclingDocument.model_validate_json(fr.read()) - - return verify_docitems(pred_doc, true_doc, fuzzy=False) - - def test_e2e_pptx_conversions(): pptx_paths = get_pptx_paths() @@ -84,4 +69,6 @@ def test_e2e_pptx_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "document document" + assert verify_document( + doc, str(gt_path) + ".json", GENERATE + ), "document document" diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 0493dac..d94ccfb 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -1,4 +1,5 @@ import json +import os import warnings from pathlib import Path from typing import List, Optional @@ -457,3 +458,17 @@ def verify_conversion_result_v2( assert verify_dt( doc_pred_dt, doc_true_dt, fuzzy=fuzzy ), f"Mismatch in DocTags prediction for {input_path}" + + +def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False): + + if not os.path.exists(gtfile) or generate: + with open(gtfile, "w") as fw: + json.dump(pred_doc.export_to_dict(), fw, indent=2) + + return True + else: + with open(gtfile) as fr: + true_doc = DoclingDocument.model_validate_json(fr.read()) + + return verify_docitems(pred_doc, true_doc, fuzzy=False)