fix(docx): identifying numbered headers (#1231)
* Modifications to identify numbered headers

  Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

* Add style check

  Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>

---------

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
0974ba4e1c
commit
f739d0e4c5
@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.max_levels: int = 10
|
||||
self.level_at_new_list: Optional[int] = None
|
||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||
self.numbered_headers: dict[int, int] = {}
|
||||
for i in range(-1, self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=None, label=DocItemLabel.TITLE, text=text
|
||||
)
|
||||
elif "Heading" in p_style_id:
|
||||
self.add_header(doc, p_level, text)
|
||||
style_element = getattr(paragraph.style, "element", None)
|
||||
if style_element:
|
||||
is_numbered_style = (
|
||||
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
||||
)
|
||||
else:
|
||||
is_numbered_style = False
|
||||
self.add_header(doc, p_level, text, is_numbered_style)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
||||
@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
return
|
||||
|
||||
def add_header(
|
||||
self, doc: DoclingDocument, curr_level: Optional[int], text: str
|
||||
self,
|
||||
doc: DoclingDocument,
|
||||
curr_level: Optional[int],
|
||||
text: str,
|
||||
is_numbered_style: bool = False,
|
||||
) -> None:
|
||||
level = self.get_level()
|
||||
if isinstance(curr_level, int):
|
||||
@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
if key >= curr_level:
|
||||
self.parents[key] = None
|
||||
|
||||
self.parents[curr_level] = doc.add_heading(
|
||||
parent=self.parents[curr_level - 1],
|
||||
text=text,
|
||||
level=curr_level,
|
||||
)
|
||||
current_level = curr_level
|
||||
parent_level = curr_level - 1
|
||||
add_level = curr_level
|
||||
else:
|
||||
self.parents[self.level] = doc.add_heading(
|
||||
parent=self.parents[self.level - 1],
|
||||
text=text,
|
||||
level=1,
|
||||
)
|
||||
current_level = self.level
|
||||
parent_level = self.level - 1
|
||||
add_level = 1
|
||||
|
||||
if is_numbered_style:
|
||||
if add_level in self.numbered_headers:
|
||||
self.numbered_headers[add_level] += 1
|
||||
else:
|
||||
self.numbered_headers[add_level] = 1
|
||||
text = f"{self.numbered_headers[add_level]} {text}"
|
||||
|
||||
# Reset deeper levels
|
||||
next_level = add_level + 1
|
||||
while next_level in self.numbered_headers:
|
||||
self.numbered_headers[next_level] = 0
|
||||
next_level += 1
|
||||
|
||||
# Scan upper levels
|
||||
previous_level = add_level - 1
|
||||
while previous_level in self.numbered_headers:
|
||||
# MSWord convention: no empty sublevels
|
||||
# I.e., sub-sub section (2.0.1) without a sub-section (2.1)
|
||||
# is processed as 2.1.1
|
||||
if self.numbered_headers[previous_level] == 0:
|
||||
self.numbered_headers[previous_level] += 1
|
||||
|
||||
text = f"{self.numbered_headers[previous_level]}.{text}"
|
||||
previous_level -= 1
|
||||
|
||||
self.parents[current_level] = doc.add_heading(
|
||||
parent=self.parents[parent_level],
|
||||
text=text,
|
||||
level=add_level,
|
||||
)
|
||||
return
|
||||
|
||||
def add_listitem(
|
||||
|
@ -1,7 +1,7 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: title: Test Document
|
||||
item-2 at level 2: paragraph:
|
||||
item-3 at level 2: section_header: Section 1
|
||||
item-3 at level 2: section_header: 1 Section 1
|
||||
item-4 at level 1: paragraph:
|
||||
item-5 at level 1: paragraph: Paragraph 1.1
|
||||
item-6 at level 1: paragraph:
|
||||
@ -9,26 +9,26 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-8 at level 1: paragraph:
|
||||
item-9 at level 1: section: group header-0
|
||||
item-10 at level 2: section: group header-1
|
||||
item-11 at level 3: section_header: Section 1.1
|
||||
item-11 at level 3: section_header: 1.1 Section 1.1
|
||||
item-12 at level 4: paragraph:
|
||||
item-13 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-14 at level 4: paragraph:
|
||||
item-15 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-16 at level 4: paragraph:
|
||||
item-17 at level 3: section_header: Section 1.2
|
||||
item-17 at level 3: section_header: 1.2 Section 1.2
|
||||
item-18 at level 4: paragraph:
|
||||
item-19 at level 4: paragraph: Paragraph 1.1.1
|
||||
item-20 at level 4: paragraph:
|
||||
item-21 at level 4: paragraph: Paragraph 1.1.2
|
||||
item-22 at level 4: paragraph:
|
||||
item-23 at level 4: section_header: Section 1.2.3
|
||||
item-23 at level 4: section_header: 1.2.1 Section 1.2.3
|
||||
item-24 at level 5: paragraph:
|
||||
item-25 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-26 at level 5: paragraph:
|
||||
item-27 at level 5: paragraph: Paragraph 1.2.3.1
|
||||
item-28 at level 5: paragraph:
|
||||
item-29 at level 5: paragraph:
|
||||
item-30 at level 2: section_header: Section 2
|
||||
item-30 at level 2: section_header: 2 Section 2
|
||||
item-31 at level 1: paragraph:
|
||||
item-32 at level 1: paragraph: Paragraph 2.1
|
||||
item-33 at level 1: paragraph:
|
||||
@ -37,13 +37,13 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-36 at level 1: section: group header-0
|
||||
item-37 at level 2: section: group header-1
|
||||
item-38 at level 3: section: group header-2
|
||||
item-39 at level 4: section_header: Section 2.1.1
|
||||
item-39 at level 4: section_header: 2.1.1 Section 2.1.1
|
||||
item-40 at level 5: paragraph:
|
||||
item-41 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-42 at level 5: paragraph:
|
||||
item-43 at level 5: paragraph: Paragraph 2.1.1.1
|
||||
item-44 at level 5: paragraph:
|
||||
item-45 at level 3: section_header: Section 2.1
|
||||
item-45 at level 3: section_header: 2.2 Section 2.1
|
||||
item-46 at level 4: paragraph:
|
||||
item-47 at level 4: paragraph: Paragraph 2.1.1
|
||||
item-48 at level 4: paragraph:
|
||||
|
@ -183,8 +183,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 1",
|
||||
"text": "Section 1",
|
||||
"orig": "1 Section 1",
|
||||
"text": "1 Section 1",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
@ -272,8 +272,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 1.1",
|
||||
"text": "Section 1.1",
|
||||
"orig": "1.1 Section 1.1",
|
||||
"text": "1.1 Section 1.1",
|
||||
"level": 2
|
||||
},
|
||||
{
|
||||
@ -364,8 +364,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 1.2",
|
||||
"text": "Section 1.2",
|
||||
"orig": "1.2 Section 1.2",
|
||||
"text": "1.2 Section 1.2",
|
||||
"level": 2
|
||||
},
|
||||
{
|
||||
@ -456,8 +456,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 1.2.3",
|
||||
"text": "Section 1.2.3",
|
||||
"orig": "1.2.1 Section 1.2.3",
|
||||
"text": "1.2.1 Section 1.2.3",
|
||||
"level": 3
|
||||
},
|
||||
{
|
||||
@ -541,8 +541,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 2",
|
||||
"text": "Section 2",
|
||||
"orig": "2 Section 2",
|
||||
"text": "2 Section 2",
|
||||
"level": 1
|
||||
},
|
||||
{
|
||||
@ -630,8 +630,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 2.1.1",
|
||||
"text": "Section 2.1.1",
|
||||
"orig": "2.1.1 Section 2.1.1",
|
||||
"text": "2.1.1 Section 2.1.1",
|
||||
"level": 3
|
||||
},
|
||||
{
|
||||
@ -722,8 +722,8 @@
|
||||
"content_layer": "body",
|
||||
"label": "section_header",
|
||||
"prov": [],
|
||||
"orig": "Section 2.1",
|
||||
"text": "Section 2.1",
|
||||
"orig": "2.2 Section 2.1",
|
||||
"text": "2.2 Section 2.1",
|
||||
"level": 2
|
||||
},
|
||||
{
|
||||
|
@ -1,42 +1,42 @@
|
||||
# Test Document
|
||||
|
||||
## Section 1
|
||||
## 1 Section 1
|
||||
|
||||
Paragraph 1.1
|
||||
|
||||
Paragraph 1.2
|
||||
|
||||
### Section 1.1
|
||||
### 1.1 Section 1.1
|
||||
|
||||
Paragraph 1.1.1
|
||||
|
||||
Paragraph 1.1.2
|
||||
|
||||
### Section 1.2
|
||||
### 1.2 Section 1.2
|
||||
|
||||
Paragraph 1.1.1
|
||||
|
||||
Paragraph 1.1.2
|
||||
|
||||
#### Section 1.2.3
|
||||
#### 1.2.1 Section 1.2.3
|
||||
|
||||
Paragraph 1.2.3.1
|
||||
|
||||
Paragraph 1.2.3.1
|
||||
|
||||
## Section 2
|
||||
## 2 Section 2
|
||||
|
||||
Paragraph 2.1
|
||||
|
||||
Paragraph 2.2
|
||||
|
||||
#### Section 2.1.1
|
||||
#### 2.1.1 Section 2.1.1
|
||||
|
||||
Paragraph 2.1.1.1
|
||||
|
||||
Paragraph 2.1.1.1
|
||||
|
||||
### Section 2.1
|
||||
### 2.2 Section 2.1
|
||||
|
||||
Paragraph 2.1.1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user