From f94da44ec5c7a8c92b9dd60e4df5dc945ed6d1ea Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:56:58 +0100 Subject: [PATCH] fix(html): handle nested empty lists (#1154) Address the case of nested lists in empty list items. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 12 +- .../docling_v2/example_07.html.itxt | 22 ++ .../docling_v2/example_07.html.json | 374 ++++++++++++++++++ .../groundtruth/docling_v2/example_07.html.md | 14 + tests/data/html/example_07.html | 40 ++ 5 files changed, 456 insertions(+), 6 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/example_07.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_07.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_07.html.md create mode 100644 tests/data/html/example_07.html diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 14c2b44..d222297 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: _log.error( - f"Error processing child from tag{tag.name}: {exc_child}" + f"Error processing child from tag {tag.name}: {repr(exc_child)}" ) raise exc_child elif isinstance(element, NavigableString) and not isinstance( @@ -347,11 +347,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) self.level += 1 - - self.walk(element, doc) - - self.parents[self.level + 1] = None - self.level -= 1 + self.walk(element, doc) + self.parents[self.level + 1] = None + self.level -= 1 + else: + self.walk(element, doc) elif element.text.strip(): text = element.text.strip() diff --git a/tests/data/groundtruth/docling_v2/example_07.html.itxt b/tests/data/groundtruth/docling_v2/example_07.html.itxt new file mode 100644 index 0000000..675749e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.itxt @@ -0,0 +1,22 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: list: group list + item-2 at level 2: list_item: Asia + item-3 at level 3: list: group list + item-4 at level 4: list_item: China + item-5 at level 4: list_item: Japan + item-6 at level 4: list_item: Thailand + item-7 at level 2: list_item: Europe + item-8 at level 3: list: group list + item-9 at level 4: list_item: UK + item-10 at level 4: list_item: Germany + item-11 at level 4: list_item: Switzerland + item-12 at level 5: list: group list + item-13 at level 6: list: group list + item-14 at level 7: list_item: Bern + item-15 at level 7: list_item: Aargau + item-16 at level 4: list_item: Italy + item-17 at level 5: list: group list + item-18 at level 6: list: group list + item-19 at level 7: list_item: Piedmont + item-20 at level 7: list_item: Liguria + item-21 at level 2: list_item: Africa \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_07.html.json b/tests/data/groundtruth/docling_v2/example_07.html.json new file mode 100644 index 0000000..248a47b --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.json @@ -0,0 +1,374 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.2.0", + "name": "example_07", + "origin": { + "mimetype": "text/html", + "binary_hash": 623628706615267627, + "filename": "example_07.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/13" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/4" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/7" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/texts/10" + }, + "children": [ + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/groups/5" + }, + "children": [ + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Asia", + "text": "Asia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "China", + "text": "China", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Japan", + "text": "Japan", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Thailand", + "text": "Thailand", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/2" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Europe", + "text": "Europe", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "UK", + "text": "UK", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Germany", + "text": "Germany", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Switzerland", + "text": "Switzerland", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Bern", + "text": "Bern", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Aargau", + "text": "Aargau", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Italy", + "text": "Italy", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Piedmont", + "text": "Piedmont", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Liguria", + "text": "Liguria", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Africa", + "text": "Africa", + "enumerated": false, + "marker": "-" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_07.html.md b/tests/data/groundtruth/docling_v2/example_07.html.md new file mode 100644 index 0000000..0a4b9fa --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.md @@ -0,0 +1,14 @@ +- Asia + - China + - Japan + - Thailand +- Europe + - UK + - Germany + - Switzerland + - Bern + - Aargau + - Italy + - Piedmont + - Liguria +- Africa \ No newline at end of file diff --git a/tests/data/html/example_07.html b/tests/data/html/example_07.html new file mode 100644 index 0000000..a8c58c1 --- /dev/null +++ b/tests/data/html/example_07.html @@ -0,0 +1,40 @@ + + + + + \ No newline at end of file