From ed20124544a1b10f068b11bbdf12e1bfc7567195 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 23 Apr 2025 09:30:59 +0200 Subject: [PATCH] fix(html): handle address, details, and summary tags (#1436) * fix(html): handle 'address' tag Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * fix(html): handle 'details' tag Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --------- Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 24 ++++++++- .../docling_v2/example_06.html.itxt | 5 +- .../docling_v2/example_06.html.json | 51 +++++++++++++++++-- .../groundtruth/docling_v2/example_06.html.md | 4 ++ tests/data/html/example_06.html | 4 ++ 5 files changed, 82 insertions(+), 6 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index aa2637f..88a315d 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -26,6 +26,8 @@ _log = logging.getLogger(__name__) # tags that generate NodeItem elements TAGS_FOR_NODE_ITEMS: Final = [ + "address", + "details", "h1", "h2", "h3", @@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [ "ul", "ol", "li", + "summary", "table", "figure", "img", @@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None: if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: self.handle_header(tag, doc) - elif tag.name in ["p"]: + elif tag.name in ["p", "address", "summary"]: self.handle_paragraph(tag, doc) elif tag.name in ["pre", "code"]: self.handle_code(tag, doc) @@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.handle_figure(tag, doc) elif tag.name == "img": self.handle_image(tag, doc) + elif tag.name == "details": + self.handle_details(tag, doc) else: self.walk(tag, doc) @@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return ["".join(result) + " "] + def handle_details(self, element: Tag, doc: DoclingDocument) -> None: + """Handle details tag (details) and its content.""" + + self.parents[self.level + 1] = doc.add_group( + name="details", + label=GroupLabel.SECTION, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + + self.level += 1 + self.walk(element, doc) + self.parents[self.level + 1] = None + self.level -= 1 + def handle_header(self, element: Tag, doc: DoclingDocument) -> None: """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) @@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None: - """Handles paragraph tags (p).""" + """Handles paragraph tags (p) or equivalent ones.""" if element.text is None: return text = element.text.strip() diff --git a/tests/data/groundtruth/docling_v2/example_06.html.itxt b/tests/data/groundtruth/docling_v2/example_06.html.itxt index 0cbdcf4..f9d5ef5 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.itxt +++ b/tests/data/groundtruth/docling_v2/example_06.html.itxt @@ -4,4 +4,7 @@ item-0 at level 0: unspecified: group _root_ item-3 at level 1: text: This is a regular paragraph. item-4 at level 1: text: This is a third div with a new line. - item-5 at level 1: text: This is a fourth div with a bold paragraph. \ No newline at end of file + item-5 at level 1: section: group details + item-6 at level 2: text: Heading for the details element + item-7 at level 2: text: Description of the details element. + item-8 at level 1: text: This is a fourth div with a bold paragraph. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_06.html.json b/tests/data/groundtruth/docling_v2/example_06.html.json index 2f48245..f62a2cf 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.json +++ b/tests/data/groundtruth/docling_v2/example_06.html.json @@ -4,7 +4,7 @@ "name": "example_06", "origin": { "mimetype": "text/html", - "binary_hash": 14574683870626799530, + "binary_hash": 10224930410364781672, "filename": "example_06.html" }, "furniture": { @@ -30,14 +30,35 @@ "$ref": "#/texts/3" }, { - "$ref": "#/texts/4" + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/6" } ], "content_layer": "body", "name": "_root_", "label": "unspecified" }, - "groups": [], + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "details", + "label": "section" + } + ], "texts": [ { "self_ref": "#/texts/0", @@ -89,6 +110,30 @@ }, { "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Heading for the details element", + "text": "Heading for the details element" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Description of the details element.", + "text": "Description of the details element." + }, + { + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, diff --git a/tests/data/groundtruth/docling_v2/example_06.html.md b/tests/data/groundtruth/docling_v2/example_06.html.md index ed105e6..de672b8 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.md +++ b/tests/data/groundtruth/docling_v2/example_06.html.md @@ -7,4 +7,8 @@ This is a regular paragraph. This is a third div with a new line. +Heading for the details element + +Description of the details element. + This is a fourth div with a bold paragraph. \ No newline at end of file diff --git a/tests/data/html/example_06.html b/tests/data/html/example_06.html index efafd27..8c244a5 100644 --- a/tests/data/html/example_06.html +++ b/tests/data/html/example_06.html @@ -7,6 +7,10 @@
This is a regular paragraph.
Description of the details element.
+This is a fourth div with a bold paragraph.