feat: equations to latex in MSWord backend (with inline groups) (#1114)
* Equation groups Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * fix: Proper handling of orphan IDs in layout postprocessing (#1118) * Fix the handling of orphan IDs in layout postprocessing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * chore: bump version to 2.25.2 [skip ci] * docs: add description of DOCLING_ARTIFACTS_PATH env var (#1124) add env var in docs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * fix(CLI): fix help message for abort options (#1130) fix help message Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * perf: New revision code formula model and document picture classifier (#1140) * new version code formula model Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * new version document picture classifier Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * new code formula model Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * restored original code formula test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> --------- Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * feat: Use new TableFormer model weights and default to accurate model version (#1100) * feat: New tableformer model weights [WIP] Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Updated TF version Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated tests, after merging with Main, Switched to Accurate TF model by default Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * chore: bump version to 2.26.0 [skip ci] * fix: Pass tests, update docling-core to 2.22.0 (#1150) fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Updating content hash Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> --------- Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
aa92a57fa9
commit
6eb718f849
453
docling/backend/docx/latex/omml.py
Normal file
453
docling/backend/docx/latex/omml.py
Normal file
@@ -0,0 +1,453 @@
|
||||
"""
|
||||
Office Math Markup Language (OMML)
|
||||
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
||||
On 23/01/2025
|
||||
"""
|
||||
|
||||
import lxml.etree as ET
|
||||
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
||||
|
||||
from docling.backend.docx.latex.latex_dict import (
|
||||
ALN,
|
||||
ARR,
|
||||
BACKSLASH,
|
||||
BLANK,
|
||||
BRK,
|
||||
CHARS,
|
||||
CHR,
|
||||
CHR_BO,
|
||||
CHR_DEFAULT,
|
||||
D_DEFAULT,
|
||||
F_DEFAULT,
|
||||
FUNC,
|
||||
FUNC_PLACE,
|
||||
LIM_FUNC,
|
||||
LIM_TO,
|
||||
LIM_UPP,
|
||||
POS,
|
||||
POS_DEFAULT,
|
||||
RAD,
|
||||
RAD_DEFAULT,
|
||||
SUB,
|
||||
SUP,
|
||||
D,
|
||||
F,
|
||||
M,
|
||||
T,
|
||||
)
|
||||
|
||||
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
||||
|
||||
|
||||
def load(stream):
|
||||
tree = ET.parse(stream)
|
||||
for omath in tree.findall(OMML_NS + "oMath"):
|
||||
yield oMath2Latex(omath)
|
||||
|
||||
|
||||
def load_string(string):
|
||||
root = ET.fromstring(string)
|
||||
for omath in root.findall(OMML_NS + "oMath"):
|
||||
yield oMath2Latex(omath)
|
||||
|
||||
|
||||
def escape_latex(strs):
|
||||
last = None
|
||||
new_chr = []
|
||||
strs = strs.replace(r"\\", "\\")
|
||||
for c in strs:
|
||||
if (c in CHARS) and (last != BACKSLASH):
|
||||
new_chr.append(BACKSLASH + c)
|
||||
else:
|
||||
new_chr.append(c)
|
||||
last = c
|
||||
return BLANK.join(new_chr)
|
||||
|
||||
|
||||
def get_val(key, default=None, store=CHR):
|
||||
if key is not None:
|
||||
return key if not store else store.get(key, key)
|
||||
else:
|
||||
return default
|
||||
|
||||
|
||||
class Tag2Method(object):
|
||||
|
||||
def call_method(self, elm, stag=None):
|
||||
getmethod = self.tag2meth.get
|
||||
if stag is None:
|
||||
stag = elm.tag.replace(OMML_NS, "")
|
||||
method = getmethod(stag)
|
||||
if method:
|
||||
return method(self, elm)
|
||||
else:
|
||||
return None
|
||||
|
||||
def process_children_list(self, elm, include=None):
|
||||
"""
|
||||
process children of the elm,return iterable
|
||||
"""
|
||||
for _e in list(elm):
|
||||
if OMML_NS not in _e.tag:
|
||||
continue
|
||||
stag = _e.tag.replace(OMML_NS, "")
|
||||
if include and (stag not in include):
|
||||
continue
|
||||
t = self.call_method(_e, stag=stag)
|
||||
if t is None:
|
||||
t = self.process_unknow(_e, stag)
|
||||
if t is None:
|
||||
continue
|
||||
yield (stag, t, _e)
|
||||
|
||||
def process_children_dict(self, elm, include=None):
|
||||
"""
|
||||
process children of the elm,return dict
|
||||
"""
|
||||
latex_chars = dict()
|
||||
for stag, t, e in self.process_children_list(elm, include):
|
||||
latex_chars[stag] = t
|
||||
return latex_chars
|
||||
|
||||
def process_children(self, elm, include=None):
|
||||
"""
|
||||
process children of the elm,return string
|
||||
"""
|
||||
return BLANK.join(
|
||||
(
|
||||
t if not isinstance(t, Tag2Method) else str(t)
|
||||
for stag, t, e in self.process_children_list(elm, include)
|
||||
)
|
||||
)
|
||||
|
||||
def process_unknow(self, elm, stag):
|
||||
return None
|
||||
|
||||
|
||||
class Pr(Tag2Method):
|
||||
|
||||
text = ""
|
||||
|
||||
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
||||
|
||||
__innerdict = None # can't use the __dict__
|
||||
|
||||
""" common properties of element"""
|
||||
|
||||
def __init__(self, elm):
|
||||
self.__innerdict = {}
|
||||
self.text = self.process_children(elm)
|
||||
|
||||
def __str__(self):
|
||||
return self.text
|
||||
|
||||
def __unicode__(self):
|
||||
return self.__str__(self)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return self.__innerdict.get(name, None)
|
||||
|
||||
def do_brk(self, elm):
|
||||
self.__innerdict["brk"] = BRK
|
||||
return BRK
|
||||
|
||||
def do_common(self, elm):
|
||||
stag = elm.tag.replace(OMML_NS, "")
|
||||
if stag in self.__val_tags:
|
||||
t = elm.get("{0}val".format(OMML_NS))
|
||||
self.__innerdict[stag] = t
|
||||
return None
|
||||
|
||||
tag2meth = {
|
||||
"brk": do_brk,
|
||||
"chr": do_common,
|
||||
"pos": do_common,
|
||||
"begChr": do_common,
|
||||
"endChr": do_common,
|
||||
"type": do_common,
|
||||
}
|
||||
|
||||
|
||||
class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
Convert oMath element of omml to latex
|
||||
"""
|
||||
|
||||
_t_dict = T
|
||||
|
||||
__direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
|
||||
u = UnicodeToLatexEncoder(
|
||||
replacement_latex_protection="braces-all",
|
||||
unknown_char_policy="keep",
|
||||
unknown_char_warning=False,
|
||||
)
|
||||
|
||||
def __init__(self, element):
|
||||
self._latex = self.process_children(element)
|
||||
|
||||
def __str__(self):
|
||||
return self.latex.replace(" ", " ")
|
||||
|
||||
def __unicode__(self):
|
||||
return self.__str__(self)
|
||||
|
||||
def process_unknow(self, elm, stag):
|
||||
if stag in self.__direct_tags:
|
||||
return self.process_children(elm)
|
||||
elif stag[-2:] == "Pr":
|
||||
return Pr(elm)
|
||||
else:
|
||||
return None
|
||||
|
||||
@property
|
||||
def latex(self):
|
||||
return self._latex
|
||||
|
||||
def do_acc(self, elm):
|
||||
"""
|
||||
the accent function
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
latex_s = get_val(
|
||||
c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
|
||||
)
|
||||
return latex_s.format(c_dict["e"])
|
||||
|
||||
def do_bar(self, elm):
|
||||
"""
|
||||
the bar function
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["barPr"]
|
||||
latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
|
||||
return pr.text + latex_s.format(c_dict["e"])
|
||||
|
||||
def do_d(self, elm):
|
||||
"""
|
||||
the delimiter object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["dPr"]
|
||||
null = D_DEFAULT.get("null")
|
||||
|
||||
s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
|
||||
e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
|
||||
delim = pr.text + D.format(
|
||||
left=null if not s_val else escape_latex(s_val),
|
||||
text=c_dict["e"],
|
||||
right=null if not e_val else escape_latex(e_val),
|
||||
)
|
||||
return delim
|
||||
|
||||
def do_spre(self, elm):
|
||||
"""
|
||||
the Pre-Sub-Superscript object -- Not support yet
|
||||
"""
|
||||
pass
|
||||
|
||||
def do_sub(self, elm):
|
||||
text = self.process_children(elm)
|
||||
return SUB.format(text)
|
||||
|
||||
def do_sup(self, elm):
|
||||
text = self.process_children(elm)
|
||||
return SUP.format(text)
|
||||
|
||||
def do_f(self, elm):
|
||||
"""
|
||||
the fraction object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["fPr"]
|
||||
latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
|
||||
return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))
|
||||
|
||||
def do_func(self, elm):
|
||||
"""
|
||||
the Function-Apply object (Examples:sin cos)
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
func_name = c_dict.get("fName")
|
||||
return func_name.replace(FUNC_PLACE, c_dict.get("e"))
|
||||
|
||||
def do_fname(self, elm):
|
||||
"""
|
||||
the func name
|
||||
"""
|
||||
latex_chars = []
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "r":
|
||||
if FUNC.get(t):
|
||||
latex_chars.append(FUNC[t])
|
||||
else:
|
||||
raise NotSupport("Not support func %s" % t)
|
||||
else:
|
||||
latex_chars.append(t)
|
||||
t = BLANK.join(latex_chars)
|
||||
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
||||
|
||||
def do_groupchr(self, elm):
|
||||
"""
|
||||
the Group-Character object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
pr = c_dict["groupChrPr"]
|
||||
latex_s = get_val(pr.chr)
|
||||
return pr.text + latex_s.format(c_dict["e"])
|
||||
|
||||
def do_rad(self, elm):
|
||||
"""
|
||||
the radical object
|
||||
"""
|
||||
c_dict = self.process_children_dict(elm)
|
||||
text = c_dict.get("e")
|
||||
deg_text = c_dict.get("deg")
|
||||
if deg_text:
|
||||
return RAD.format(deg=deg_text, text=text)
|
||||
else:
|
||||
return RAD_DEFAULT.format(text=text)
|
||||
|
||||
def do_eqarr(self, elm):
|
||||
"""
|
||||
the Array object
|
||||
"""
|
||||
return ARR.format(
|
||||
text=BRK.join(
|
||||
[t for stag, t, e in self.process_children_list(elm, include=("e",))]
|
||||
)
|
||||
)
|
||||
|
||||
def do_limlow(self, elm):
|
||||
"""
|
||||
the Lower-Limit object
|
||||
"""
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||
if not latex_s:
|
||||
raise NotSupport("Not support lim %s" % t_dict["e"])
|
||||
else:
|
||||
return latex_s.format(lim=t_dict.get("lim"))
|
||||
|
||||
def do_limupp(self, elm):
|
||||
"""
|
||||
the Upper-Limit object
|
||||
"""
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))
|
||||
|
||||
def do_lim(self, elm):
|
||||
"""
|
||||
the lower limit of the limLow object and the upper limit of the limUpp function
|
||||
"""
|
||||
return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])
|
||||
|
||||
def do_m(self, elm):
|
||||
"""
|
||||
the Matrix object
|
||||
"""
|
||||
rows = []
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "mPr":
|
||||
pass
|
||||
elif stag == "mr":
|
||||
rows.append(t)
|
||||
return M.format(text=BRK.join(rows))
|
||||
|
||||
def do_mr(self, elm):
|
||||
"""
|
||||
a single row of the matrix m
|
||||
"""
|
||||
return ALN.join(
|
||||
[t for stag, t, e in self.process_children_list(elm, include=("e",))]
|
||||
)
|
||||
|
||||
def do_nary(self, elm):
|
||||
"""
|
||||
the n-ary object
|
||||
"""
|
||||
res = []
|
||||
bo = ""
|
||||
for stag, t, e in self.process_children_list(elm):
|
||||
if stag == "naryPr":
|
||||
bo = get_val(t.chr, store=CHR_BO)
|
||||
else:
|
||||
res.append(t)
|
||||
return bo + BLANK.join(res)
|
||||
|
||||
def process_unicode(self, s):
|
||||
# s = s if isinstance(s,unicode) else unicode(s,'utf-8')
|
||||
# print(s, self._t_dict.get(s, s), unicode_to_latex(s))
|
||||
# _str.append( self._t_dict.get(s, s) )
|
||||
|
||||
out_latex_str = self.u.unicode_to_latex(s)
|
||||
|
||||
# print(s, out_latex_str)
|
||||
|
||||
if (
|
||||
s.startswith("{") is False
|
||||
and out_latex_str.startswith("{")
|
||||
and s.endswith("}") is False
|
||||
and out_latex_str.endswith("}")
|
||||
):
|
||||
out_latex_str = f" {out_latex_str[1:-1]} "
|
||||
|
||||
# print(s, out_latex_str)
|
||||
|
||||
if "ensuremath" in out_latex_str:
|
||||
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
|
||||
out_latex_str = out_latex_str.replace("}", " ")
|
||||
|
||||
# print(s, out_latex_str)
|
||||
|
||||
if out_latex_str.strip().startswith("\\text"):
|
||||
out_latex_str = f" \\text{{{out_latex_str}}} "
|
||||
|
||||
# print(s, out_latex_str)
|
||||
|
||||
return out_latex_str
|
||||
|
||||
def do_r(self, elm):
|
||||
"""
|
||||
Get text from 'r' element,And try convert them to latex symbols
|
||||
@todo text style support , (sty)
|
||||
@todo \text (latex pure text support)
|
||||
"""
|
||||
_str = []
|
||||
_base_str = []
|
||||
for s in elm.findtext("./{0}t".format(OMML_NS)):
|
||||
out_latex_str = self.process_unicode(s)
|
||||
_str.append(out_latex_str)
|
||||
_base_str.append(s)
|
||||
|
||||
proc_str = escape_latex(BLANK.join(_str))
|
||||
base_proc_str = BLANK.join(_base_str)
|
||||
|
||||
if "{" not in base_proc_str and "\\{" in proc_str:
|
||||
proc_str = proc_str.replace("\\{", "{")
|
||||
|
||||
if "}" not in base_proc_str and "\\}" in proc_str:
|
||||
proc_str = proc_str.replace("\\}", "}")
|
||||
|
||||
return proc_str
|
||||
|
||||
tag2meth = {
|
||||
"acc": do_acc,
|
||||
"r": do_r,
|
||||
"bar": do_bar,
|
||||
"sub": do_sub,
|
||||
"sup": do_sup,
|
||||
"f": do_f,
|
||||
"func": do_func,
|
||||
"fName": do_fname,
|
||||
"groupChr": do_groupchr,
|
||||
"d": do_d,
|
||||
"rad": do_rad,
|
||||
"eqArr": do_eqarr,
|
||||
"limLow": do_limlow,
|
||||
"limUpp": do_limupp,
|
||||
"lim": do_lim,
|
||||
"m": do_m,
|
||||
"mr": do_mr,
|
||||
"nary": do_nary,
|
||||
}
|
||||
Reference in New Issue
Block a user