diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 0af3db5..32e69b9 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -240,7 +240,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): numid = None # Handle lists - if numid is not None and ilevel is not None: + if ( + numid is not None + and ilevel is not None + and p_style_id not in ["Title", "Heading"] + ): self.add_listitem( element, docx_obj, @@ -254,12 +258,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) self.update_history(p_style_id, p_level, numid, ilevel) return - elif numid is None and self.prev_numid() is not None: # Close list - for key, val in self.parents.items(): - if key >= self.level_at_new_list: + elif ( + numid is None + and self.prev_numid() is not None + and p_style_id not in ["Title", "Heading"] + ): # Close list + if self.level_at_new_list: + for key in range(len(self.parents)): + if key >= self.level_at_new_list: + self.parents[key] = None + self.level = self.level_at_new_list - 1 + self.level_at_new_list = None + else: + for key in range(len(self.parents)): self.parents[key] = None - self.level = self.level_at_new_list - 1 - self.level_at_new_list = None + self.level = 0 + if p_style_id in ["Title"]: for key, val in self.parents.items(): self.parents[key] = None diff --git a/tests/data/docx/unit_test_headers_numbered.docx b/tests/data/docx/unit_test_headers_numbered.docx new file mode 100644 index 0000000..259125c Binary files /dev/null and b/tests/data/docx/unit_test_headers_numbered.docx differ diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt new file mode 100644 index 0000000..fe31772 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt @@ -0,0 +1,52 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Test Document + item-2 at level 2: paragraph: + item-3 at level 2: section_header: Section 1 + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Paragraph 1.1 + item-6 at level 1: paragraph: + item-7 at level 1: paragraph: Paragraph 1.2 + item-8 at level 1: paragraph: + item-9 at level 1: section: group header-0 + item-10 at level 2: section: group header-1 + item-11 at level 3: section_header: Section 1.1 + item-12 at level 4: paragraph: + item-13 at level 4: paragraph: Paragraph 1.1.1 + item-14 at level 4: paragraph: + item-15 at level 4: paragraph: Paragraph 1.1.2 + item-16 at level 4: paragraph: + item-17 at level 3: section_header: Section 1.2 + item-18 at level 4: paragraph: + item-19 at level 4: paragraph: Paragraph 1.1.1 + item-20 at level 4: paragraph: + item-21 at level 4: paragraph: Paragraph 1.1.2 + item-22 at level 4: paragraph: + item-23 at level 4: section_header: Section 1.2.3 + item-24 at level 5: paragraph: + item-25 at level 5: paragraph: Paragraph 1.2.3.1 + item-26 at level 5: paragraph: + item-27 at level 5: paragraph: Paragraph 1.2.3.1 + item-28 at level 5: paragraph: + item-29 at level 5: paragraph: + item-30 at level 2: section_header: Section 2 + item-31 at level 1: paragraph: + item-32 at level 1: paragraph: Paragraph 2.1 + item-33 at level 1: paragraph: + item-34 at level 1: paragraph: Paragraph 2.2 + item-35 at level 1: paragraph: + item-36 at level 1: section: group header-0 + item-37 at level 2: section: group header-1 + item-38 at level 3: section: group header-2 + item-39 at level 4: section_header: Section 2.1.1 + item-40 at level 5: paragraph: + item-41 at level 5: paragraph: Paragraph 2.1.1.1 + item-42 at level 5: paragraph: + item-43 at level 5: paragraph: Paragraph 2.1.1.1 + item-44 at level 5: paragraph: + item-45 at level 3: section_header: Section 2.1 + item-46 at level 4: paragraph: + item-47 at level 4: paragraph: Paragraph 2.1.1 + item-48 at level 4: paragraph: + item-49 at level 4: paragraph: Paragraph 2.1.2 + item-50 at level 4: paragraph: + item-51 at level 4: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json new file mode 100644 index 0000000..38a25d3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -0,0 +1,753 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "unit_test_headers_numbered", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 7684538628968220703, + "filename": "unit_test_headers_numbered.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/groups/2" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/27" + } + ], + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/14" + } + ], + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/3" + } + ], + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/texts/39" + } + ], + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/33" + } + ], + "name": "header-2", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "label": "title", + "prov": [], + "orig": "Test Document", + "text": "Test Document" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "Section 1", + "text": "Section 1", + "level": 1 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1", + "text": "Paragraph 1.1" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2", + "text": "Paragraph 1.2" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.1", + "text": "Section 1.1", + "level": 2 + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2", + "text": "Section 1.2", + "level": 2 + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.1", + "text": "Paragraph 1.1.1" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.1.2", + "text": "Paragraph 1.1.2" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/texts/14" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/texts/14" + }, + "children": [ + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 1.2.3", + "text": "Section 1.2.3", + "level": 3 + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 1.2.3.1", + "text": "Paragraph 1.2.3.1" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/texts/20" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "label": "section_header", + "prov": [], + "orig": "Section 2", + "text": "Section 2", + "level": 1 + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1", + "text": "Paragraph 2.1" + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.2", + "text": "Paragraph 2.2" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1.1", + "text": "Section 2.1.1", + "level": 3 + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/36", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/37", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1.1", + "text": "Paragraph 2.1.1.1" + }, + { + "self_ref": "#/texts/38", + "parent": { + "$ref": "#/texts/33" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/39", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + } + ], + "label": "section_header", + "prov": [], + "orig": "Section 2.1", + "text": "Section 2.1", + "level": 2 + }, + { + "self_ref": "#/texts/40", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/41", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.1", + "text": "Paragraph 2.1.1" + }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "Paragraph 2.1.2", + "text": "Paragraph 2.1.2" + }, + { + "self_ref": "#/texts/44", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/texts/39" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md new file mode 100644 index 0000000..d4c8acc --- /dev/null +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md @@ -0,0 +1,43 @@ +# Test Document + +## Section 1 + +Paragraph 1.1 + +Paragraph 1.2 + +### Section 1.1 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +### Section 1.2 + +Paragraph 1.1.1 + +Paragraph 1.1.2 + +#### Section 1.2.3 + +Paragraph 1.2.3.1 + +Paragraph 1.2.3.1 + +## Section 2 + +Paragraph 2.1 + +Paragraph 2.2 + +#### Section 2.1.1 + +Paragraph 2.1.1.1 + +Paragraph 2.1.1.1 + +### Section 2.1 + +Paragraph 2.1.1 + +Paragraph 2.1.2 \ No newline at end of file