diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 78fe7df..47775b8 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.max_levels: int = 10 self.level_at_new_list: Optional[int] = None self.parents: dict[int, Optional[NodeItem]] = {} + self.numbered_headers: dict[int, int] = {} for i in range(-1, self.max_levels): self.parents[i] = None @@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=None, label=DocItemLabel.TITLE, text=text ) elif "Heading" in p_style_id: - self.add_header(doc, p_level, text) + style_element = getattr(paragraph.style, "element", None) + if style_element: + is_numbered_style = ( + "" in style_element.xml or "" in element.xml + ) + else: + is_numbered_style = False + self.add_header(doc, p_level, text, is_numbered_style) elif len(equations) > 0: if (raw_text is None or len(raw_text) == 0) and len(text) > 0: @@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return def add_header( - self, doc: DoclingDocument, curr_level: Optional[int], text: str + self, + doc: DoclingDocument, + curr_level: Optional[int], + text: str, + is_numbered_style: bool = False, ) -> None: level = self.get_level() if isinstance(curr_level, int): @@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if key >= curr_level: self.parents[key] = None - self.parents[curr_level] = doc.add_heading( - parent=self.parents[curr_level - 1], - text=text, - level=curr_level, - ) + current_level = curr_level + parent_level = curr_level - 1 + add_level = curr_level else: - self.parents[self.level] = doc.add_heading( - parent=self.parents[self.level - 1], - text=text, - level=1, - ) + current_level = self.level + parent_level = self.level - 1 + add_level = 1 + + if is_numbered_style: + if add_level in self.numbered_headers: + self.numbered_headers[add_level] += 1 + else: + self.numbered_headers[add_level] = 1 + text = f"{self.numbered_headers[add_level]} {text}" + + # Reset deeper levels + next_level = add_level + 1 + while next_level in self.numbered_headers: + self.numbered_headers[next_level] = 0 + next_level += 1 + + # Scan upper levels + previous_level = add_level - 1 + while previous_level in self.numbered_headers: + # MSWord convention: no empty sublevels + # I.e., sub-sub section (2.0.1) without a sub-section (2.1) + # is processed as 2.1.1 + if self.numbered_headers[previous_level] == 0: + self.numbered_headers[previous_level] += 1 + + text = f"{self.numbered_headers[previous_level]}.{text}" + previous_level -= 1 + + self.parents[current_level] = doc.add_heading( + parent=self.parents[parent_level], + text=text, + level=add_level, + ) return def add_listitem( diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt index fe31772..8b916d5 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.itxt @@ -1,7 +1,7 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: title: Test Document item-2 at level 2: paragraph: - item-3 at level 2: section_header: Section 1 + item-3 at level 2: section_header: 1 Section 1 item-4 at level 1: paragraph: item-5 at level 1: paragraph: Paragraph 1.1 item-6 at level 1: paragraph: @@ -9,26 +9,26 @@ item-0 at level 0: unspecified: group _root_ item-8 at level 1: paragraph: item-9 at level 1: section: group header-0 item-10 at level 2: section: group header-1 - item-11 at level 3: section_header: Section 1.1 + item-11 at level 3: section_header: 1.1 Section 1.1 item-12 at level 4: paragraph: item-13 at level 4: paragraph: Paragraph 1.1.1 item-14 at level 4: paragraph: item-15 at level 4: paragraph: Paragraph 1.1.2 item-16 at level 4: paragraph: - item-17 at level 3: section_header: Section 1.2 + item-17 at level 3: section_header: 1.2 Section 1.2 item-18 at level 4: paragraph: item-19 at level 4: paragraph: Paragraph 1.1.1 item-20 at level 4: paragraph: item-21 at level 4: paragraph: Paragraph 1.1.2 item-22 at level 4: paragraph: - item-23 at level 4: section_header: Section 1.2.3 + item-23 at level 4: section_header: 1.2.1 Section 1.2.3 item-24 at level 5: paragraph: item-25 at level 5: paragraph: Paragraph 1.2.3.1 item-26 at level 5: paragraph: item-27 at level 5: paragraph: Paragraph 1.2.3.1 item-28 at level 5: paragraph: item-29 at level 5: paragraph: - item-30 at level 2: section_header: Section 2 + item-30 at level 2: section_header: 2 Section 2 item-31 at level 1: paragraph: item-32 at level 1: paragraph: Paragraph 2.1 item-33 at level 1: paragraph: @@ -37,13 +37,13 @@ item-0 at level 0: unspecified: group _root_ item-36 at level 1: section: group header-0 item-37 at level 2: section: group header-1 item-38 at level 3: section: group header-2 - item-39 at level 4: section_header: Section 2.1.1 + item-39 at level 4: section_header: 2.1.1 Section 2.1.1 item-40 at level 5: paragraph: item-41 at level 5: paragraph: Paragraph 2.1.1.1 item-42 at level 5: paragraph: item-43 at level 5: paragraph: Paragraph 2.1.1.1 item-44 at level 5: paragraph: - item-45 at level 3: section_header: Section 2.1 + item-45 at level 3: section_header: 2.2 Section 2.1 item-46 at level 4: paragraph: item-47 at level 4: paragraph: Paragraph 2.1.1 item-48 at level 4: paragraph: diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json index 24df7f2..f29621a 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -183,8 +183,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 1", - "text": "Section 1", + "orig": "1 Section 1", + "text": "1 Section 1", "level": 1 }, { @@ -272,8 +272,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 1.1", - "text": "Section 1.1", + "orig": "1.1 Section 1.1", + "text": "1.1 Section 1.1", "level": 2 }, { @@ -364,8 +364,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 1.2", - "text": "Section 1.2", + "orig": "1.2 Section 1.2", + "text": "1.2 Section 1.2", "level": 2 }, { @@ -456,8 +456,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 1.2.3", - "text": "Section 1.2.3", + "orig": "1.2.1 Section 1.2.3", + "text": "1.2.1 Section 1.2.3", "level": 3 }, { @@ -541,8 +541,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 2", - "text": "Section 2", + "orig": "2 Section 2", + "text": "2 Section 2", "level": 1 }, { @@ -630,8 +630,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 2.1.1", - "text": "Section 2.1.1", + "orig": "2.1.1 Section 2.1.1", + "text": "2.1.1 Section 2.1.1", "level": 3 }, { @@ -722,8 +722,8 @@ "content_layer": "body", "label": "section_header", "prov": [], - "orig": "Section 2.1", - "text": "Section 2.1", + "orig": "2.2 Section 2.1", + "text": "2.2 Section 2.1", "level": 2 }, { diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md index d4c8acc..fb1b82c 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.md @@ -1,42 +1,42 @@ # Test Document -## Section 1 +## 1 Section 1 Paragraph 1.1 Paragraph 1.2 -### Section 1.1 +### 1.1 Section 1.1 Paragraph 1.1.1 Paragraph 1.1.2 -### Section 1.2 +### 1.2 Section 1.2 Paragraph 1.1.1 Paragraph 1.1.2 -#### Section 1.2.3 +#### 1.2.1 Section 1.2.3 Paragraph 1.2.3.1 Paragraph 1.2.3.1 -## Section 2 +## 2 Section 2 Paragraph 2.1 Paragraph 2.2 -#### Section 2.1.1 +#### 2.1.1 Section 2.1.1 Paragraph 2.1.1.1 Paragraph 2.1.1.1 -### Section 2.1 +### 2.2 Section 2.1 Paragraph 2.1.1