fix(docx): identifying numbered headers (#1231)

* Modifications to identify numbered headers

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Add style check

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-03-25 11:41:02 +01:00 committed by GitHub
parent 0974ba4e1c
commit f739d0e4c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 79 additions and 40 deletions

View File

@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.max_levels: int = 10 self.max_levels: int = 10
self.level_at_new_list: Optional[int] = None self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {} self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {}
for i in range(-1, self.max_levels): for i in range(-1, self.max_levels):
self.parents[i] = None self.parents[i] = None
@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=None, label=DocItemLabel.TITLE, text=text parent=None, label=DocItemLabel.TITLE, text=text
) )
elif "Heading" in p_style_id: elif "Heading" in p_style_id:
self.add_header(doc, p_level, text) style_element = getattr(paragraph.style, "element", None)
if style_element:
is_numbered_style = (
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
)
else:
is_numbered_style = False
self.add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0: elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0: if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return return
def add_header( def add_header(
self, doc: DoclingDocument, curr_level: Optional[int], text: str self,
doc: DoclingDocument,
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> None: ) -> None:
level = self.get_level() level = self.get_level()
if isinstance(curr_level, int): if isinstance(curr_level, int):
@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if key >= curr_level: if key >= curr_level:
self.parents[key] = None self.parents[key] = None
self.parents[curr_level] = doc.add_heading( current_level = curr_level
parent=self.parents[curr_level - 1], parent_level = curr_level - 1
text=text, add_level = curr_level
level=curr_level,
)
else: else:
self.parents[self.level] = doc.add_heading( current_level = self.level
parent=self.parents[self.level - 1], parent_level = self.level - 1
text=text, add_level = 1
level=1,
) if is_numbered_style:
if add_level in self.numbered_headers:
self.numbered_headers[add_level] += 1
else:
self.numbered_headers[add_level] = 1
text = f"{self.numbered_headers[add_level]} {text}"
# Reset deeper levels
next_level = add_level + 1
while next_level in self.numbered_headers:
self.numbered_headers[next_level] = 0
next_level += 1
# Scan upper levels
previous_level = add_level - 1
while previous_level in self.numbered_headers:
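# MS Word convention: heading numbers never contain an empty (zero) sublevel.
# E.g., a sub-subsection that appears before any subsection (nominally 2.0.1)
# is numbered 2.1.1: the missing parent level's counter is bumped from 0 to 1.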
if self.numbered_headers[previous_level] == 0:
self.numbered_headers[previous_level] += 1
text = f"{self.numbered_headers[previous_level]}.{text}"
previous_level -= 1
self.parents[current_level] = doc.add_heading(
parent=self.parents[parent_level],
text=text,
level=add_level,
)
return return
def add_listitem( def add_listitem(

View File

@ -1,7 +1,7 @@
item-0 at level 0: unspecified: group _root_ item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document item-1 at level 1: title: Test Document
item-2 at level 2: paragraph: item-2 at level 2: paragraph:
item-3 at level 2: section_header: Section 1 item-3 at level 2: section_header: 1 Section 1
item-4 at level 1: paragraph: item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Paragraph 1.1 item-5 at level 1: paragraph: Paragraph 1.1
item-6 at level 1: paragraph: item-6 at level 1: paragraph:
@ -9,26 +9,26 @@ item-0 at level 0: unspecified: group _root_
item-8 at level 1: paragraph: item-8 at level 1: paragraph:
item-9 at level 1: section: group header-0 item-9 at level 1: section: group header-0
item-10 at level 2: section: group header-1 item-10 at level 2: section: group header-1
item-11 at level 3: section_header: Section 1.1 item-11 at level 3: section_header: 1.1 Section 1.1
item-12 at level 4: paragraph: item-12 at level 4: paragraph:
item-13 at level 4: paragraph: Paragraph 1.1.1 item-13 at level 4: paragraph: Paragraph 1.1.1
item-14 at level 4: paragraph: item-14 at level 4: paragraph:
item-15 at level 4: paragraph: Paragraph 1.1.2 item-15 at level 4: paragraph: Paragraph 1.1.2
item-16 at level 4: paragraph: item-16 at level 4: paragraph:
item-17 at level 3: section_header: Section 1.2 item-17 at level 3: section_header: 1.2 Section 1.2
item-18 at level 4: paragraph: item-18 at level 4: paragraph:
item-19 at level 4: paragraph: Paragraph 1.1.1 item-19 at level 4: paragraph: Paragraph 1.1.1
item-20 at level 4: paragraph: item-20 at level 4: paragraph:
item-21 at level 4: paragraph: Paragraph 1.1.2 item-21 at level 4: paragraph: Paragraph 1.1.2
item-22 at level 4: paragraph: item-22 at level 4: paragraph:
item-23 at level 4: section_header: Section 1.2.3 item-23 at level 4: section_header: 1.2.1 Section 1.2.3
item-24 at level 5: paragraph: item-24 at level 5: paragraph:
item-25 at level 5: paragraph: Paragraph 1.2.3.1 item-25 at level 5: paragraph: Paragraph 1.2.3.1
item-26 at level 5: paragraph: item-26 at level 5: paragraph:
item-27 at level 5: paragraph: Paragraph 1.2.3.1 item-27 at level 5: paragraph: Paragraph 1.2.3.1
item-28 at level 5: paragraph: item-28 at level 5: paragraph:
item-29 at level 5: paragraph: item-29 at level 5: paragraph:
item-30 at level 2: section_header: Section 2 item-30 at level 2: section_header: 2 Section 2
item-31 at level 1: paragraph: item-31 at level 1: paragraph:
item-32 at level 1: paragraph: Paragraph 2.1 item-32 at level 1: paragraph: Paragraph 2.1
item-33 at level 1: paragraph: item-33 at level 1: paragraph:
@ -37,13 +37,13 @@ item-0 at level 0: unspecified: group _root_
item-36 at level 1: section: group header-0 item-36 at level 1: section: group header-0
item-37 at level 2: section: group header-1 item-37 at level 2: section: group header-1
item-38 at level 3: section: group header-2 item-38 at level 3: section: group header-2
item-39 at level 4: section_header: Section 2.1.1 item-39 at level 4: section_header: 2.1.1 Section 2.1.1
item-40 at level 5: paragraph: item-40 at level 5: paragraph:
item-41 at level 5: paragraph: Paragraph 2.1.1.1 item-41 at level 5: paragraph: Paragraph 2.1.1.1
item-42 at level 5: paragraph: item-42 at level 5: paragraph:
item-43 at level 5: paragraph: Paragraph 2.1.1.1 item-43 at level 5: paragraph: Paragraph 2.1.1.1
item-44 at level 5: paragraph: item-44 at level 5: paragraph:
item-45 at level 3: section_header: Section 2.1 item-45 at level 3: section_header: 2.2 Section 2.1
item-46 at level 4: paragraph: item-46 at level 4: paragraph:
item-47 at level 4: paragraph: Paragraph 2.1.1 item-47 at level 4: paragraph: Paragraph 2.1.1
item-48 at level 4: paragraph: item-48 at level 4: paragraph:

View File

@ -183,8 +183,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 1", "orig": "1 Section 1",
"text": "Section 1", "text": "1 Section 1",
"level": 1 "level": 1
}, },
{ {
@ -272,8 +272,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 1.1", "orig": "1.1 Section 1.1",
"text": "Section 1.1", "text": "1.1 Section 1.1",
"level": 2 "level": 2
}, },
{ {
@ -364,8 +364,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 1.2", "orig": "1.2 Section 1.2",
"text": "Section 1.2", "text": "1.2 Section 1.2",
"level": 2 "level": 2
}, },
{ {
@ -456,8 +456,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 1.2.3", "orig": "1.2.1 Section 1.2.3",
"text": "Section 1.2.3", "text": "1.2.1 Section 1.2.3",
"level": 3 "level": 3
}, },
{ {
@ -541,8 +541,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 2", "orig": "2 Section 2",
"text": "Section 2", "text": "2 Section 2",
"level": 1 "level": 1
}, },
{ {
@ -630,8 +630,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 2.1.1", "orig": "2.1.1 Section 2.1.1",
"text": "Section 2.1.1", "text": "2.1.1 Section 2.1.1",
"level": 3 "level": 3
}, },
{ {
@ -722,8 +722,8 @@
"content_layer": "body", "content_layer": "body",
"label": "section_header", "label": "section_header",
"prov": [], "prov": [],
"orig": "Section 2.1", "orig": "2.2 Section 2.1",
"text": "Section 2.1", "text": "2.2 Section 2.1",
"level": 2 "level": 2
}, },
{ {

View File

@ -1,42 +1,42 @@
# Test Document # Test Document
## Section 1 ## 1 Section 1
Paragraph 1.1 Paragraph 1.1
Paragraph 1.2 Paragraph 1.2
### Section 1.1 ### 1.1 Section 1.1
Paragraph 1.1.1 Paragraph 1.1.1
Paragraph 1.1.2 Paragraph 1.1.2
### Section 1.2 ### 1.2 Section 1.2
Paragraph 1.1.1 Paragraph 1.1.1
Paragraph 1.1.2 Paragraph 1.1.2
#### Section 1.2.3 #### 1.2.1 Section 1.2.3
Paragraph 1.2.3.1 Paragraph 1.2.3.1
Paragraph 1.2.3.1 Paragraph 1.2.3.1
## Section 2 ## 2 Section 2
Paragraph 2.1 Paragraph 2.1
Paragraph 2.2 Paragraph 2.2
#### Section 2.1.1 #### 2.1.1 Section 2.1.1
Paragraph 2.1.1.1 Paragraph 2.1.1.1
Paragraph 2.1.1.1 Paragraph 2.1.1.1
### Section 2.1 ### 2.2 Section 2.1
Paragraph 2.1.1 Paragraph 2.1.1