diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index b802605..7fd69cf 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -180,11 +180,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
)
- elif hlevel == self.level:
- self.parents[hlevel] = doc.add_text(
- parent=self.parents[hlevel - 1], label=label, text=text
- )
-
elif hlevel > self.level:
# add invisible group
@@ -194,10 +189,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
)
-
- self.parents[hlevel] = doc.add_text(
- parent=self.parents[hlevel - 1], label=label, text=text
- )
self.level = hlevel
elif hlevel < self.level:
@@ -206,12 +197,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for key, val in self.parents.items():
if key > hlevel:
self.parents[key] = None
-
- self.parents[hlevel] = doc.add_text(
- parent=self.parents[hlevel - 1], label=label, text=text
- )
self.level = hlevel
+ self.parents[hlevel] = doc.add_heading(
+ parent=self.parents[hlevel - 1],
+ text=text,
+ level=hlevel,
+ )
+
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 5b166d5..08529ea 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level = self.get_level()
if isinstance(curr_level, int):
- if curr_level == level:
-
- self.parents[level] = doc.add_heading(
- parent=self.parents[level - 1], text=text
- )
-
- elif curr_level > level:
+ if curr_level > level:
# add invisible group
for i in range(level, curr_level):
@@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}",
)
- self.parents[curr_level] = doc.add_heading(
- parent=self.parents[curr_level - 1], text=text
- )
-
elif curr_level < level:
# remove the tail
@@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if key >= curr_level:
self.parents[key] = None
- self.parents[curr_level] = doc.add_heading(
- parent=self.parents[curr_level - 1], text=text
- )
+ self.parents[curr_level] = doc.add_heading(
+ parent=self.parents[curr_level - 1],
+ text=text,
+ level=curr_level,
+ )
else:
self.parents[self.level] = doc.add_heading(
- parent=self.parents[self.level - 1], text=text
+ parent=self.parents[self.level - 1],
+ text=text,
+ level=1,
)
return
diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py
new file mode 100644
index 0000000..f5ec035
--- /dev/null
+++ b/tests/test_backend_html.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument, SectionHeaderItem
+
+
+def test_heading_levels():
+    """Convert the sample HTML page and check its section-header levels.
+
+    "Etymology" must come out at level 2 and "Feeding" at level 3, and both
+    headers must be present in the converted document.
+    """
+    html_file = Path("tests/data/wiki_duck.html")
+    input_doc = InputDocument(
+        path_or_stream=html_file,
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+    )
+    parser = HTMLDocumentBackend(in_doc=input_doc, path_or_stream=html_file)
+    converted = parser.convert()
+
+    expected = {"Etymology": 2, "Feeding": 3}  # header text -> required level
+    seen = set()
+    for entry, _ in converted.iterate_items():
+        if not isinstance(entry, SectionHeaderItem) or entry.text not in expected:
+            continue
+        seen.add(entry.text)
+        assert entry.level == expected[entry.text]
+    assert seen == set(expected)
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
new file mode 100644
index 0000000..4544e71
--- /dev/null
+++ b/tests/test_backend_msword.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument, SectionHeaderItem
+
+
+def test_heading_levels():
+    """Convert the sample Word file and check its section-header levels.
+
+    The two headers looked up below (their text uses the U+2019 apostrophe)
+    must come out at level 1 and level 2, and both must be present.
+    """
+    docx_file = Path("tests/data/word_sample.docx")
+    input_doc = InputDocument(
+        path_or_stream=docx_file,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+    )
+    parser = MsWordDocumentBackend(in_doc=input_doc, path_or_stream=docx_file)
+    converted = parser.convert()
+
+    expected = {"Let\u2019s swim!": 1, "Let\u2019s eat": 2}  # text -> level
+    seen = set()
+    for entry, _ in converted.iterate_items():
+        if not isinstance(entry, SectionHeaderItem) or entry.text not in expected:
+            continue
+        seen.add(entry.text)
+        assert entry.level == expected[entry.text]
+    assert seen == set(expected)