fix(docx): identifying numbered headers (#1231)

* Modifications to identify numbered headers

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Add style check

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-03-25 11:41:02 +01:00 committed by GitHub
parent 0974ba4e1c
commit f739d0e4c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 79 additions and 40 deletions

View File

@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.max_levels: int = 10
self.level_at_new_list: Optional[int] = None
self.parents: dict[int, Optional[NodeItem]] = {}
self.numbered_headers: dict[int, int] = {}
for i in range(-1, self.max_levels):
self.parents[i] = None
@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=None, label=DocItemLabel.TITLE, text=text
)
elif "Heading" in p_style_id:
self.add_header(doc, p_level, text)
style_element = getattr(paragraph.style, "element", None)
if style_element:
is_numbered_style = (
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
)
else:
is_numbered_style = False
self.add_header(doc, p_level, text, is_numbered_style)
elif len(equations) > 0:
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return
def add_header(
self, doc: DoclingDocument, curr_level: Optional[int], text: str
self,
doc: DoclingDocument,
curr_level: Optional[int],
text: str,
is_numbered_style: bool = False,
) -> None:
level = self.get_level()
if isinstance(curr_level, int):
@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if key >= curr_level:
self.parents[key] = None
self.parents[curr_level] = doc.add_heading(
parent=self.parents[curr_level - 1],
text=text,
level=curr_level,
)
current_level = curr_level
parent_level = curr_level - 1
add_level = curr_level
else:
self.parents[self.level] = doc.add_heading(
parent=self.parents[self.level - 1],
text=text,
level=1,
)
current_level = self.level
parent_level = self.level - 1
add_level = 1
if is_numbered_style:
if add_level in self.numbered_headers:
self.numbered_headers[add_level] += 1
else:
self.numbered_headers[add_level] = 1
text = f"{self.numbered_headers[add_level]} {text}"
# Reset deeper levels
next_level = add_level + 1
while next_level in self.numbered_headers:
self.numbered_headers[next_level] = 0
next_level += 1
# Scan upper levels
previous_level = add_level - 1
while previous_level in self.numbered_headers:
# MSWord convention: no empty sublevels.
# I.e., a sub-sub-section (nominally 2.0.1) that appears without a
# preceding sub-section (2.1) is numbered 2.1.1.
if self.numbered_headers[previous_level] == 0:
self.numbered_headers[previous_level] += 1
text = f"{self.numbered_headers[previous_level]}.{text}"
previous_level -= 1
self.parents[current_level] = doc.add_heading(
parent=self.parents[parent_level],
text=text,
level=add_level,
)
return
def add_listitem(

View File

@ -1,7 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Test Document
item-2 at level 2: paragraph:
item-3 at level 2: section_header: Section 1
item-3 at level 2: section_header: 1 Section 1
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Paragraph 1.1
item-6 at level 1: paragraph:
@ -9,26 +9,26 @@ item-0 at level 0: unspecified: group _root_
item-8 at level 1: paragraph:
item-9 at level 1: section: group header-0
item-10 at level 2: section: group header-1
item-11 at level 3: section_header: Section 1.1
item-11 at level 3: section_header: 1.1 Section 1.1
item-12 at level 4: paragraph:
item-13 at level 4: paragraph: Paragraph 1.1.1
item-14 at level 4: paragraph:
item-15 at level 4: paragraph: Paragraph 1.1.2
item-16 at level 4: paragraph:
item-17 at level 3: section_header: Section 1.2
item-17 at level 3: section_header: 1.2 Section 1.2
item-18 at level 4: paragraph:
item-19 at level 4: paragraph: Paragraph 1.1.1
item-20 at level 4: paragraph:
item-21 at level 4: paragraph: Paragraph 1.1.2
item-22 at level 4: paragraph:
item-23 at level 4: section_header: Section 1.2.3
item-23 at level 4: section_header: 1.2.1 Section 1.2.3
item-24 at level 5: paragraph:
item-25 at level 5: paragraph: Paragraph 1.2.3.1
item-26 at level 5: paragraph:
item-27 at level 5: paragraph: Paragraph 1.2.3.1
item-28 at level 5: paragraph:
item-29 at level 5: paragraph:
item-30 at level 2: section_header: Section 2
item-30 at level 2: section_header: 2 Section 2
item-31 at level 1: paragraph:
item-32 at level 1: paragraph: Paragraph 2.1
item-33 at level 1: paragraph:
@ -37,13 +37,13 @@ item-0 at level 0: unspecified: group _root_
item-36 at level 1: section: group header-0
item-37 at level 2: section: group header-1
item-38 at level 3: section: group header-2
item-39 at level 4: section_header: Section 2.1.1
item-39 at level 4: section_header: 2.1.1 Section 2.1.1
item-40 at level 5: paragraph:
item-41 at level 5: paragraph: Paragraph 2.1.1.1
item-42 at level 5: paragraph:
item-43 at level 5: paragraph: Paragraph 2.1.1.1
item-44 at level 5: paragraph:
item-45 at level 3: section_header: Section 2.1
item-45 at level 3: section_header: 2.2 Section 2.1
item-46 at level 4: paragraph:
item-47 at level 4: paragraph: Paragraph 2.1.1
item-48 at level 4: paragraph:

View File

@ -183,8 +183,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 1",
"text": "Section 1",
"orig": "1 Section 1",
"text": "1 Section 1",
"level": 1
},
{
@ -272,8 +272,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 1.1",
"text": "Section 1.1",
"orig": "1.1 Section 1.1",
"text": "1.1 Section 1.1",
"level": 2
},
{
@ -364,8 +364,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 1.2",
"text": "Section 1.2",
"orig": "1.2 Section 1.2",
"text": "1.2 Section 1.2",
"level": 2
},
{
@ -456,8 +456,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 1.2.3",
"text": "Section 1.2.3",
"orig": "1.2.1 Section 1.2.3",
"text": "1.2.1 Section 1.2.3",
"level": 3
},
{
@ -541,8 +541,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 2",
"text": "Section 2",
"orig": "2 Section 2",
"text": "2 Section 2",
"level": 1
},
{
@ -630,8 +630,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 2.1.1",
"text": "Section 2.1.1",
"orig": "2.1.1 Section 2.1.1",
"text": "2.1.1 Section 2.1.1",
"level": 3
},
{
@ -722,8 +722,8 @@
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Section 2.1",
"text": "Section 2.1",
"orig": "2.2 Section 2.1",
"text": "2.2 Section 2.1",
"level": 2
},
{

View File

@ -1,42 +1,42 @@
# Test Document
## Section 1
## 1 Section 1
Paragraph 1.1
Paragraph 1.2
### Section 1.1
### 1.1 Section 1.1
Paragraph 1.1.1
Paragraph 1.1.2
### Section 1.2
### 1.2 Section 1.2
Paragraph 1.1.1
Paragraph 1.1.2
#### Section 1.2.3
#### 1.2.1 Section 1.2.3
Paragraph 1.2.3.1
Paragraph 1.2.3.1
## Section 2
## 2 Section 2
Paragraph 2.1
Paragraph 2.2
#### Section 2.1.1
#### 2.1.1 Section 2.1.1
Paragraph 2.1.1.1
Paragraph 2.1.1.1
### Section 2.1
### 2.2 Section 2.1
Paragraph 2.1.1