From b9f5c74a7d13827c2b7887ddbf0b4eb43edd0846 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:02:52 +0100 Subject: [PATCH] fix: fix header levels for DOCX & HTML (#184) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/html_backend.py | 19 ++++++------------- docling/backend/msword_backend.py | 24 +++++++++--------------- tests/test_backend_html.py | 30 ++++++++++++++++++++++++++++++ tests/test_backend_msword.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 28 deletions(-) create mode 100644 tests/test_backend_html.py create mode 100644 tests/test_backend_msword.py diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index b802605..7fd69cf 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -180,11 +180,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[0], label=DocItemLabel.TITLE, text=text ) - elif hlevel == self.level: - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) - elif hlevel > self.level: # add invisible group @@ -194,10 +189,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label=GroupLabel.SECTION, parent=self.parents[i - 1], ) - - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) self.level = hlevel elif hlevel < self.level: @@ -206,12 +197,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): for key, val in self.parents.items(): if key > hlevel: self.parents[key] = None - - self.parents[hlevel] = doc.add_text( - parent=self.parents[hlevel - 1], label=label, text=text - ) self.level = hlevel + self.parents[hlevel] = doc.add_heading( + parent=self.parents[hlevel - 1], + text=text, + level=hlevel, + ) + def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" if element.text is None: diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5b166d5..08529ea 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -294,13 +294,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self.get_level() if isinstance(curr_level, int): - if curr_level == level: - - self.parents[level] = doc.add_heading( - parent=self.parents[level - 1], text=text - ) - - elif curr_level > level: + if curr_level > level: # add invisible group for i in range(level, curr_level): @@ -310,10 +304,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): name=f"header-{i}", ) - self.parents[curr_level] = doc.add_heading( - parent=self.parents[curr_level - 1], text=text - ) - elif curr_level < level: # remove the tail @@ -321,13 +311,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if key >= curr_level: self.parents[key] = None - self.parents[curr_level] = doc.add_heading( - parent=self.parents[curr_level - 1], text=text - ) + self.parents[curr_level] = doc.add_heading( + parent=self.parents[curr_level - 1], + text=text, + level=curr_level, + ) else: self.parents[self.level] = doc.add_heading( - parent=self.parents[self.level - 1], text=text + parent=self.parents[self.level - 1], + text=text, + level=1, ) return diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py new file mode 100644 index 0000000..f5ec035 --- /dev/null +++ b/tests/test_backend_html.py @@ -0,0 +1,30 @@ +from pathlib import Path + +from docling.backend.html_backend import HTMLDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument, SectionHeaderItem + + +def test_heading_levels(): + in_path = Path("tests/data/wiki_duck.html") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.HTML, + backend=HTMLDocumentBackend, + ) + backend = HTMLDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_lvl_2 = found_lvl_3 = False + for item, _ in doc.iterate_items(): + if isinstance(item, SectionHeaderItem): + if item.text == "Etymology": + found_lvl_2 = True + assert item.level == 2 + elif item.text == "Feeding": + found_lvl_3 = True + assert item.level == 3 + assert found_lvl_2 and found_lvl_3 diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py new file mode 100644 index 0000000..4544e71 --- /dev/null +++ b/tests/test_backend_msword.py @@ -0,0 +1,30 @@ +from pathlib import Path + +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument, SectionHeaderItem + + +def test_heading_levels(): + in_path = Path("tests/data/word_sample.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_lvl_1 = found_lvl_2 = False + for item, _ in doc.iterate_items(): + if isinstance(item, SectionHeaderItem): + if item.text == "Let\u2019s swim!": + found_lvl_1 = True + assert item.level == 1 + elif item.text == "Let\u2019s eat": + found_lvl_2 = True + assert item.level == 2 + assert found_lvl_1 and found_lvl_2