feat(MS Word backend): Make detection of headers and other styles localization agnostic (#534)

Use style IDs instead of style names to detect headers and other styles; unlike style names, style IDs should be localization agnostic.

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maxim Lysak 2024-12-06 15:17:56 +01:00 committed by GitHub
parent 53039a8367
commit 3e073dfbeb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,5 @@
import logging import logging
import re
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
except ValueError: except ValueError:
return default return default
def split_text_and_number(self, input_string):
    """Split a string into its text part and its number part.

    The string is split only when it consists of exactly one run of
    non-digit characters and one run of digits, in either order
    (for example ``"Heading1"`` or ``"1Heading"``).

    Args:
        input_string: The string to split.

    Returns:
        A two-element list with the parts in their original order
        (``["Heading", "1"]`` or ``["1", "Heading"]``), or a single-element
        list holding ``input_string`` unchanged when it does not have that
        shape.
    """
    # fullmatch anchors both alternatives at both ends. Previously only the
    # text-then-number alternative was end-anchored, so an input such as
    # "12abc34" was split into ["12", "abc"] and the trailing "34" was lost.
    match = re.fullmatch(r"(\D+)(\d+)|(\d+)(\D+)", input_string)
    if match is None:
        return [input_string]
    # Only the two groups of the alternative that matched are set; the
    # other two are None and are dropped.
    return [group for group in match.groups() if group is not None]
def get_numId_and_ilvl(self, paragraph): def get_numId_and_ilvl(self, paragraph):
# Access the XML element of the paragraph # Access the XML element of the paragraph
numPr = paragraph._element.find( numPr = paragraph._element.find(
@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def get_label_and_level(self, paragraph): def get_label_and_level(self, paragraph):
if paragraph.style is None: if paragraph.style is None:
return "Normal", None return "Normal", None
label = paragraph.style.name label = paragraph.style.style_id
if label is None: if label is None:
return "Normal", None return "Normal", None
if ":" in label: if ":" in label:
@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if len(parts) == 2: if len(parts) == 2:
return parts[0], int(parts[1]) return parts[0], int(parts[1])
parts = label.split(" ") parts = self.split_text_and_number(label)
if "Heading" in label and len(parts) == 2: if "Heading" in label and len(parts) == 2:
parts.sort() parts.sort()
@ -225,7 +234,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Identify wether list is a numbered list or not # Identify wether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name # is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False is_numbered = False
p_style_name, p_level = self.get_label_and_level(paragraph) p_style_id, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph) numid, ilevel = self.get_numId_and_ilvl(paragraph)
if numid == 0: if numid == 0:
@ -237,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element, element,
docx_obj, docx_obj,
doc, doc,
p_style_name, p_style_id,
p_level, p_level,
numid, numid,
ilevel, ilevel,
text, text,
is_numbered, is_numbered,
) )
self.update_history(p_style_name, p_level, numid, ilevel) self.update_history(p_style_id, p_level, numid, ilevel)
return return
elif numid is None and self.prev_numid() is not None: # Close list elif numid is None and self.prev_numid() is not None: # Close list
for key, val in self.parents.items(): for key, val in self.parents.items():
@ -252,16 +261,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.parents[key] = None self.parents[key] = None
self.level = self.level_at_new_list - 1 self.level = self.level_at_new_list - 1
self.level_at_new_list = None self.level_at_new_list = None
if p_style_name in ["Title"]: if p_style_id in ["Title"]:
for key, val in self.parents.items(): for key, val in self.parents.items():
self.parents[key] = None self.parents[key] = None
self.parents[0] = doc.add_text( self.parents[0] = doc.add_text(
parent=None, label=DocItemLabel.TITLE, text=text parent=None, label=DocItemLabel.TITLE, text=text
) )
elif "Heading" in p_style_name: elif "Heading" in p_style_id:
self.add_header(element, docx_obj, doc, p_style_name, p_level, text) self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
elif p_style_name in [ elif p_style_id in [
"Paragraph", "Paragraph",
"Normal", "Normal",
"Subtitle", "Subtitle",
@ -284,7 +293,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
) )
self.update_history(p_style_name, p_level, numid, ilevel) self.update_history(p_style_id, p_level, numid, ilevel)
return return
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
@ -322,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
element, element,
docx_obj, docx_obj,
doc, doc,
p_style_name, p_style_id,
p_level, p_level,
numid, numid,
ilevel, ilevel,