From 6eb718f8493038d1b4b6ae836df5a24aa13cd17e Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Thu, 13 Mar 2025 15:12:22 +0100 Subject: [PATCH] feat: equations to latex in MSWord backend (with inline groups) (#1114) * Equation groups Signed-off-by: Rafael Teixeira de Lima * fix: Proper handling of orphan IDs in layout postprocessing (#1118) * Fix the handling of orphan IDs in layout postprocessing Signed-off-by: Christoph Auer * Update test cases Signed-off-by: Christoph Auer --------- Signed-off-by: Christoph Auer Signed-off-by: Rafael Teixeira de Lima * chore: bump version to 2.25.2 [skip ci] * docs: add description of DOCLING_ARTIFACTS_PATH env var (#1124) add env var in docs Signed-off-by: Michele Dolfi Signed-off-by: Rafael Teixeira de Lima * fix(CLI): fix help message for abort options (#1130) fix help message Signed-off-by: Michele Dolfi Signed-off-by: Rafael Teixeira de Lima * perf: New revision code formula model and document picture classifier (#1140) * new version code formula model Signed-off-by: Matteo-Omenetti * new version document picture classifier Signed-off-by: Matteo-Omenetti * new code formula model Signed-off-by: Matteo-Omenetti * restored original code formula test pdf Signed-off-by: Matteo-Omenetti --------- Signed-off-by: Matteo-Omenetti Co-authored-by: Matteo-Omenetti Signed-off-by: Rafael Teixeira de Lima * feat: Use new TableFormer model weights and default to accurate model version (#1100) * feat: New tableformer model weights [WIP] Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Updated TF version Signed-off-by: Maksym Lysak * Updated tests, after merging with Main, Switched to Accurate TF model by default Signed-off-by: Maksym Lysak --------- Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak Signed-off-by: Rafael Teixeira de Lima * chore: bump version to 2.26.0 [skip ci] * fix: Pass tests, update docling-core to 2.22.0 (#1150) fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Updating content hash Signed-off-by: Rafael Teixeira de Lima --------- Signed-off-by: Rafael Teixeira de Lima Signed-off-by: Christoph Auer Signed-off-by: Michele Dolfi Signed-off-by: Matteo-Omenetti Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Co-authored-by: github-actions[bot] Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com> Co-authored-by: Matteo-Omenetti Co-authored-by: Maksym Lysak Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/docx/__init__.py | 0 docling/backend/docx/latex/__init__.py | 0 docling/backend/docx/latex/latex_dict.py | 271 ++++++++ docling/backend/docx/latex/omml.py | 453 +++++++++++++ docling/backend/msword_backend.py | 66 +- poetry.lock | 124 ++-- pyproject.toml | 100 +-- tests/data/docx/equations.docx | Bin 0 -> 15017 bytes .../docling_v2/equations.docx.itxt | 40 ++ .../docling_v2/equations.docx.json | 616 ++++++++++++++++++ .../groundtruth/docling_v2/equations.docx.md | 29 + 11 files changed, 1610 insertions(+), 89 deletions(-) create mode 100644 docling/backend/docx/__init__.py create mode 100644 docling/backend/docx/latex/__init__.py create mode 100644 docling/backend/docx/latex/latex_dict.py create mode 100644 docling/backend/docx/latex/omml.py create mode 100644 tests/data/docx/equations.docx create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.json create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.md diff --git a/docling/backend/docx/__init__.py b/docling/backend/docx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling/backend/docx/latex/__init__.py b/docling/backend/docx/latex/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py new file mode 100644 index 0000000..280358b --- /dev/null +++ b/docling/backend/docx/latex/latex_dict.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- + +""" +Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py +On 23/01/2025 +""" + +from __future__ import unicode_literals + +CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") + +BLANK = "" +BACKSLASH = "\\" +ALN = "&" + +CHR = { + # Unicode : Latex Math Symbols + # Top accents + "\u0300": "\\grave{{{0}}}", + "\u0301": "\\acute{{{0}}}", + "\u0302": "\\hat{{{0}}}", + "\u0303": "\\tilde{{{0}}}", + "\u0304": "\\bar{{{0}}}", + "\u0305": "\\overbar{{{0}}}", + "\u0306": "\\breve{{{0}}}", + "\u0307": "\\dot{{{0}}}", + "\u0308": "\\ddot{{{0}}}", + "\u0309": "\\ovhook{{{0}}}", + "\u030a": "\\ocirc{{{0}}}}", + "\u030c": "\\check{{{0}}}}", + "\u0310": "\\candra{{{0}}}", + "\u0312": "\\oturnedcomma{{{0}}}", + "\u0315": "\\ocommatopright{{{0}}}", + "\u031a": "\\droang{{{0}}}", + "\u0338": "\\not{{{0}}}", + "\u20d0": "\\leftharpoonaccent{{{0}}}", + "\u20d1": "\\rightharpoonaccent{{{0}}}", + "\u20d2": "\\vertoverlay{{{0}}}", + "\u20d6": "\\overleftarrow{{{0}}}", + "\u20d7": "\\vec{{{0}}}", + "\u20db": "\\dddot{{{0}}}", + "\u20dc": "\\ddddot{{{0}}}", + "\u20e1": "\\overleftrightarrow{{{0}}}", + "\u20e7": "\\annuity{{{0}}}", + "\u20e9": "\\widebridgeabove{{{0}}}", + "\u20f0": "\\asteraccent{{{0}}}", + # Bottom accents + "\u0330": "\\wideutilde{{{0}}}", + "\u0331": "\\underbar{{{0}}}", + "\u20e8": "\\threeunderdot{{{0}}}", + "\u20ec": "\\underrightharpoondown{{{0}}}", + "\u20ed": "\\underleftharpoondown{{{0}}}", + "\u20ee": "\\underledtarrow{{{0}}}", + "\u20ef": "\\underrightarrow{{{0}}}", + # Over | group + "\u23b4": "\\overbracket{{{0}}}", + "\u23dc": "\\overparen{{{0}}}", + "\u23de": "\\overbrace{{{0}}}", + # Under| group + "\u23b5": "\\underbracket{{{0}}}", + "\u23dd": "\\underparen{{{0}}}", + "\u23df": "\\underbrace{{{0}}}", +} + +CHR_BO = { + # Big operators, + "\u2140": "\\Bbbsum", + "\u220f": "\\prod", + "\u2210": "\\coprod", + "\u2211": "\\sum", + "\u222b": "\\int", + "\u22c0": "\\bigwedge", + "\u22c1": "\\bigvee", + "\u22c2": "\\bigcap", + "\u22c3": "\\bigcup", + "\u2a00": "\\bigodot", + "\u2a01": "\\bigoplus", + "\u2a02": "\\bigotimes", +} + +T = { + "\u2192": "\\rightarrow ", + # Greek letters + "\U0001d6fc": "\\alpha ", + "\U0001d6fd": "\\beta ", + "\U0001d6fe": "\\gamma ", + "\U0001d6ff": "\\theta ", + "\U0001d700": "\\epsilon ", + "\U0001d701": "\\zeta ", + "\U0001d702": "\\eta ", + "\U0001d703": "\\theta ", + "\U0001d704": "\\iota ", + "\U0001d705": "\\kappa ", + "\U0001d706": "\\lambda ", + "\U0001d707": "\\m ", + "\U0001d708": "\\n ", + "\U0001d709": "\\xi ", + "\U0001d70a": "\\omicron ", + "\U0001d70b": "\\pi ", + "\U0001d70c": "\\rho ", + "\U0001d70d": "\\varsigma ", + "\U0001d70e": "\\sigma ", + "\U0001d70f": "\\ta ", + "\U0001d710": "\\upsilon ", + "\U0001d711": "\\phi ", + "\U0001d712": "\\chi ", + "\U0001d713": "\\psi ", + "\U0001d714": "\\omega ", + "\U0001d715": "\\partial ", + "\U0001d716": "\\varepsilon ", + "\U0001d717": "\\vartheta ", + "\U0001d718": "\\varkappa ", + "\U0001d719": "\\varphi ", + "\U0001d71a": "\\varrho ", + "\U0001d71b": "\\varpi ", + # Relation symbols + "\u2190": "\\leftarrow ", + "\u2191": "\\uparrow ", + "\u2192": "\\rightarrow ", + "\u2193": "\\downright ", + "\u2194": "\\leftrightarrow ", + "\u2195": "\\updownarrow ", + "\u2196": "\\nwarrow ", + "\u2197": "\\nearrow ", + "\u2198": "\\searrow ", + "\u2199": "\\swarrow ", + "\u22ee": "\\vdots ", + "\u22ef": "\\cdots ", + "\u22f0": "\\adots ", + "\u22f1": "\\ddots ", + "\u2260": "\\ne ", + "\u2264": "\\leq ", + "\u2265": "\\geq ", + "\u2266": "\\leqq ", + "\u2267": "\\geqq ", + "\u2268": "\\lneqq ", + "\u2269": "\\gneqq ", + "\u226a": "\\ll ", + "\u226b": "\\gg ", + "\u2208": "\\in ", + "\u2209": "\\notin ", + "\u220b": "\\ni ", + "\u220c": "\\nni ", + # Ordinary symbols + "\u221e": "\\infty ", + # Binary relations + "\u00b1": "\\pm ", + "\u2213": "\\mp ", + # Italic, Latin, uppercase + "\U0001d434": "A", + "\U0001d435": "B", + "\U0001d436": "C", + "\U0001d437": "D", + "\U0001d438": "E", + "\U0001d439": "F", + "\U0001d43a": "G", + "\U0001d43b": "H", + "\U0001d43c": "I", + "\U0001d43d": "J", + "\U0001d43e": "K", + "\U0001d43f": "L", + "\U0001d440": "M", + "\U0001d441": "N", + "\U0001d442": "O", + "\U0001d443": "P", + "\U0001d444": "Q", + "\U0001d445": "R", + "\U0001d446": "S", + "\U0001d447": "T", + "\U0001d448": "U", + "\U0001d449": "V", + "\U0001d44a": "W", + "\U0001d44b": "X", + "\U0001d44c": "Y", + "\U0001d44d": "Z", + # Italic, Latin, lowercase + "\U0001d44e": "a", + "\U0001d44f": "b", + "\U0001d450": "c", + "\U0001d451": "d", + "\U0001d452": "e", + "\U0001d453": "f", + "\U0001d454": "g", + "\U0001d456": "i", + "\U0001d457": "j", + "\U0001d458": "k", + "\U0001d459": "l", + "\U0001d45a": "m", + "\U0001d45b": "n", + "\U0001d45c": "o", + "\U0001d45d": "p", + "\U0001d45e": "q", + "\U0001d45f": "r", + "\U0001d460": "s", + "\U0001d461": "t", + "\U0001d462": "u", + "\U0001d463": "v", + "\U0001d464": "w", + "\U0001d465": "x", + "\U0001d466": "y", + "\U0001d467": "z", +} + +FUNC = { + "sin": "\\sin({fe})", + "cos": "\\cos({fe})", + "tan": "\\tan({fe})", + "arcsin": "\\arcsin({fe})", + "arccos": "\\arccos({fe})", + "arctan": "\\arctan({fe})", + "arccot": "\\arccot({fe})", + "sinh": "\\sinh({fe})", + "cosh": "\\cosh({fe})", + "tanh": "\\tanh({fe})", + "coth": "\\coth({fe})", + "sec": "\\sec({fe})", + "csc": "\\csc({fe})", +} + +FUNC_PLACE = "{fe}" + +BRK = "\\\\" + +CHR_DEFAULT = { + "ACC_VAL": "\\hat{{{0}}}", +} + +POS = { + "top": "\\overline{{{0}}}", # not sure + "bot": "\\underline{{{0}}}", +} + +POS_DEFAULT = { + "BAR_VAL": "\\overline{{{0}}}", +} + +SUB = "_{{{0}}}" + +SUP = "^{{{0}}}" + +F = { + "bar": "\\frac{{{num}}}{{{den}}}", + "skw": r"^{{{num}}}/_{{{den}}}", + "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", + "lin": "{{{num}}}/{{{den}}}", +} +F_DEFAULT = "\\frac{{{num}}}{{{den}}}" + +D = "\\left{left}{text}\\right{right}" + +D_DEFAULT = { + "left": "(", + "right": ")", + "null": ".", +} + +RAD = "\\sqrt[{deg}]{{{text}}}" +RAD_DEFAULT = "\\sqrt{{{text}}}" +ARR = "{text}" + +LIM_FUNC = { + "lim": "\\lim_{{{lim}}}", + "max": "\\max_{{{lim}}}", + "min": "\\min_{{{lim}}}", +} + +LIM_TO = ("\\rightarrow", "\\to") + +LIM_UPP = "\\overset{{{lim}}}{{{text}}}" + +M = "\\begin{{matrix}}{text}\\end{{matrix}}" diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py new file mode 100644 index 0000000..add0de7 --- /dev/null +++ b/docling/backend/docx/latex/omml.py @@ -0,0 +1,453 @@ +""" +Office Math Markup Language (OMML) + +Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py +On 23/01/2025 +""" + +import lxml.etree as ET +from pylatexenc.latexencode import UnicodeToLatexEncoder + +from docling.backend.docx.latex.latex_dict import ( + ALN, + ARR, + BACKSLASH, + BLANK, + BRK, + CHARS, + CHR, + CHR_BO, + CHR_DEFAULT, + D_DEFAULT, + F_DEFAULT, + FUNC, + FUNC_PLACE, + LIM_FUNC, + LIM_TO, + LIM_UPP, + POS, + POS_DEFAULT, + RAD, + RAD_DEFAULT, + SUB, + SUP, + D, + F, + M, + T, +) + +OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" + + +def load(stream): + tree = ET.parse(stream) + for omath in tree.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def load_string(string): + root = ET.fromstring(string) + for omath in root.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def escape_latex(strs): + last = None + new_chr = [] + strs = strs.replace(r"\\", "\\") + for c in strs: + if (c in CHARS) and (last != BACKSLASH): + new_chr.append(BACKSLASH + c) + else: + new_chr.append(c) + last = c + return BLANK.join(new_chr) + + +def get_val(key, default=None, store=CHR): + if key is not None: + return key if not store else store.get(key, key) + else: + return default + + +class Tag2Method(object): + + def call_method(self, elm, stag=None): + getmethod = self.tag2meth.get + if stag is None: + stag = elm.tag.replace(OMML_NS, "") + method = getmethod(stag) + if method: + return method(self, elm) + else: + return None + + def process_children_list(self, elm, include=None): + """ + process children of the elm,return iterable + """ + for _e in list(elm): + if OMML_NS not in _e.tag: + continue + stag = _e.tag.replace(OMML_NS, "") + if include and (stag not in include): + continue + t = self.call_method(_e, stag=stag) + if t is None: + t = self.process_unknow(_e, stag) + if t is None: + continue + yield (stag, t, _e) + + def process_children_dict(self, elm, include=None): + """ + process children of the elm,return dict + """ + latex_chars = dict() + for stag, t, e in self.process_children_list(elm, include): + latex_chars[stag] = t + return latex_chars + + def process_children(self, elm, include=None): + """ + process children of the elm,return string + """ + return BLANK.join( + ( + t if not isinstance(t, Tag2Method) else str(t) + for stag, t, e in self.process_children_list(elm, include) + ) + ) + + def process_unknow(self, elm, stag): + return None + + +class Pr(Tag2Method): + + text = "" + + __val_tags = ("chr", "pos", "begChr", "endChr", "type") + + __innerdict = None # can't use the __dict__ + + """ common properties of element""" + + def __init__(self, elm): + self.__innerdict = {} + self.text = self.process_children(elm) + + def __str__(self): + return self.text + + def __unicode__(self): + return self.__str__(self) + + def __getattr__(self, name): + return self.__innerdict.get(name, None) + + def do_brk(self, elm): + self.__innerdict["brk"] = BRK + return BRK + + def do_common(self, elm): + stag = elm.tag.replace(OMML_NS, "") + if stag in self.__val_tags: + t = elm.get("{0}val".format(OMML_NS)) + self.__innerdict[stag] = t + return None + + tag2meth = { + "brk": do_brk, + "chr": do_common, + "pos": do_common, + "begChr": do_common, + "endChr": do_common, + "type": do_common, + } + + +class oMath2Latex(Tag2Method): + """ + Convert oMath element of omml to latex + """ + + _t_dict = T + + __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") + u = UnicodeToLatexEncoder( + replacement_latex_protection="braces-all", + unknown_char_policy="keep", + unknown_char_warning=False, + ) + + def __init__(self, element): + self._latex = self.process_children(element) + + def __str__(self): + return self.latex.replace(" ", " ") + + def __unicode__(self): + return self.__str__(self) + + def process_unknow(self, elm, stag): + if stag in self.__direct_tags: + return self.process_children(elm) + elif stag[-2:] == "Pr": + return Pr(elm) + else: + return None + + @property + def latex(self): + return self._latex + + def do_acc(self, elm): + """ + the accent function + """ + c_dict = self.process_children_dict(elm) + latex_s = get_val( + c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR + ) + return latex_s.format(c_dict["e"]) + + def do_bar(self, elm): + """ + the bar function + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["barPr"] + latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) + return pr.text + latex_s.format(c_dict["e"]) + + def do_d(self, elm): + """ + the delimiter object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["dPr"] + null = D_DEFAULT.get("null") + + s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) + e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) + delim = pr.text + D.format( + left=null if not s_val else escape_latex(s_val), + text=c_dict["e"], + right=null if not e_val else escape_latex(e_val), + ) + return delim + + def do_spre(self, elm): + """ + the Pre-Sub-Superscript object -- Not support yet + """ + pass + + def do_sub(self, elm): + text = self.process_children(elm) + return SUB.format(text) + + def do_sup(self, elm): + text = self.process_children(elm) + return SUP.format(text) + + def do_f(self, elm): + """ + the fraction object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["fPr"] + latex_s = get_val(pr.type, default=F_DEFAULT, store=F) + return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den")) + + def do_func(self, elm): + """ + the Function-Apply object (Examples:sin cos) + """ + c_dict = self.process_children_dict(elm) + func_name = c_dict.get("fName") + return func_name.replace(FUNC_PLACE, c_dict.get("e")) + + def do_fname(self, elm): + """ + the func name + """ + latex_chars = [] + for stag, t, e in self.process_children_list(elm): + if stag == "r": + if FUNC.get(t): + latex_chars.append(FUNC[t]) + else: + raise NotSupport("Not support func %s" % t) + else: + latex_chars.append(t) + t = BLANK.join(latex_chars) + return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this + + def do_groupchr(self, elm): + """ + the Group-Character object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["groupChrPr"] + latex_s = get_val(pr.chr) + return pr.text + latex_s.format(c_dict["e"]) + + def do_rad(self, elm): + """ + the radical object + """ + c_dict = self.process_children_dict(elm) + text = c_dict.get("e") + deg_text = c_dict.get("deg") + if deg_text: + return RAD.format(deg=deg_text, text=text) + else: + return RAD_DEFAULT.format(text=text) + + def do_eqarr(self, elm): + """ + the Array object + """ + return ARR.format( + text=BRK.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + ) + + def do_limlow(self, elm): + """ + the Lower-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + latex_s = LIM_FUNC.get(t_dict["e"]) + if not latex_s: + raise NotSupport("Not support lim %s" % t_dict["e"]) + else: + return latex_s.format(lim=t_dict.get("lim")) + + def do_limupp(self, elm): + """ + the Upper-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) + + def do_lim(self, elm): + """ + the lower limit of the limLow object and the upper limit of the limUpp function + """ + return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) + + def do_m(self, elm): + """ + the Matrix object + """ + rows = [] + for stag, t, e in self.process_children_list(elm): + if stag == "mPr": + pass + elif stag == "mr": + rows.append(t) + return M.format(text=BRK.join(rows)) + + def do_mr(self, elm): + """ + a single row of the matrix m + """ + return ALN.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + + def do_nary(self, elm): + """ + the n-ary object + """ + res = [] + bo = "" + for stag, t, e in self.process_children_list(elm): + if stag == "naryPr": + bo = get_val(t.chr, store=CHR_BO) + else: + res.append(t) + return bo + BLANK.join(res) + + def process_unicode(self, s): + # s = s if isinstance(s,unicode) else unicode(s,'utf-8') + # print(s, self._t_dict.get(s, s), unicode_to_latex(s)) + # _str.append( self._t_dict.get(s, s) ) + + out_latex_str = self.u.unicode_to_latex(s) + + # print(s, out_latex_str) + + if ( + s.startswith("{") is False + and out_latex_str.startswith("{") + and s.endswith("}") is False + and out_latex_str.endswith("}") + ): + out_latex_str = f" {out_latex_str[1:-1]} " + + # print(s, out_latex_str) + + if "ensuremath" in out_latex_str: + out_latex_str = out_latex_str.replace("\\ensuremath{", " ") + out_latex_str = out_latex_str.replace("}", " ") + + # print(s, out_latex_str) + + if out_latex_str.strip().startswith("\\text"): + out_latex_str = f" \\text{{{out_latex_str}}} " + + # print(s, out_latex_str) + + return out_latex_str + + def do_r(self, elm): + """ + Get text from 'r' element,And try convert them to latex symbols + @todo text style support , (sty) + @todo \text (latex pure text support) + """ + _str = [] + _base_str = [] + for s in elm.findtext("./{0}t".format(OMML_NS)): + out_latex_str = self.process_unicode(s) + _str.append(out_latex_str) + _base_str.append(s) + + proc_str = escape_latex(BLANK.join(_str)) + base_proc_str = BLANK.join(_base_str) + + if "{" not in base_proc_str and "\\{" in proc_str: + proc_str = proc_str.replace("\\{", "{") + + if "}" not in base_proc_str and "\\}" in proc_str: + proc_str = proc_str.replace("\\}", "}") + + return proc_str + + tag2meth = { + "acc": do_acc, + "r": do_r, + "bar": do_bar, + "sub": do_sub, + "sup": do_sup, + "f": do_f, + "func": do_func, + "fName": do_fname, + "groupChr": do_groupchr, + "d": do_d, + "rad": do_rad, + "eqArr": do_eqarr, + "limLow": do_limlow, + "limUpp": do_limupp, + "lim": do_lim, + "m": do_m, + "mr": do_mr, + "nary": do_nary, + } diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 1a504bc..390ea5d 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.docx.latex.omml import oMath2Latex from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -260,6 +261,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return label, None + def handle_equations_in_text(self, element, text): + only_texts = [] + only_equations = [] + texts_and_equations = [] + for subt in element.iter(): + tag_name = etree.QName(subt).localname + if tag_name == "t" and "math" not in subt.tag: + only_texts.append(subt.text) + texts_and_equations.append(subt.text) + elif "oMath" in subt.tag and "oMathPara" not in subt.tag: + latex_equation = str(oMath2Latex(subt)) + only_equations.append(latex_equation) + texts_and_equations.append(latex_equation) + + if "".join(only_texts) != text: + return text + + return "".join(texts_and_equations), only_equations + def handle_text_elements( self, element: BaseOxmlElement, @@ -268,9 +288,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) -> None: paragraph = Paragraph(element, docx_obj) - if paragraph.text is None: + raw_text = paragraph.text + text, equations = self.handle_equations_in_text(element=element, text=raw_text) + + if text is None: return - text = paragraph.text.strip() + text = text.strip() # Common styles for bullet and numbered lists. # "List Bullet", "List Number", "List Paragraph" @@ -323,6 +346,45 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif "Heading" in p_style_id: self.add_header(doc, p_level, text) + elif len(equations) > 0: + if (raw_text is None or len(raw_text) == 0) and len(text) > 0: + # Standalone equation + level = self.get_level() + doc.add_text( + label=DocItemLabel.FORMULA, + parent=self.parents[level - 1], + text=text, + ) + else: + # Inline equation + level = self.get_level() + inline_equation = doc.add_group( + label=GroupLabel.INLINE, parent=self.parents[level - 1] + ) + text_tmp = text + for eq in equations: + if len(text_tmp) == 0: + break + pre_eq_text = text_tmp.split(eq, maxsplit=1)[0] + text_tmp = text_tmp.split(eq, maxsplit=1)[1] + if len(pre_eq_text) > 0: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=inline_equation, + text=pre_eq_text, + ) + doc.add_text( + label=DocItemLabel.FORMULA, + parent=inline_equation, + text=eq, + ) + if len(text_tmp) > 0: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=inline_equation, + text=text_tmp, + ) + elif p_style_id in [ "Paragraph", "Normal", diff --git a/poetry.lock b/poetry.lock index bae2c56..5d4fdca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -33,13 +33,13 @@ testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", [[package]] name = "aiohappyeyeballs" -version = "2.4.6" +version = "2.4.8" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.9" files = [ - {file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"}, - {file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"}, + {file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"}, + {file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"}, ] [[package]] @@ -311,6 +311,24 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"] +[[package]] +name = "backrefs" +version = "5.8" +description = "A wrapper around re and regex that adds additional back references." +optional = false +python-versions = ">=3.9" +files = [ + {file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"}, + {file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"}, + {file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"}, + {file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"}, + {file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"}, + {file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"}, +] + +[package.extras] +extras = ["regex"] + [[package]] name = "beautifulsoup4" version = "4.13.3" @@ -880,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"] [[package]] name = "docling-ibm-models" -version = "3.4.0" +version = "3.4.1" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"}, - {file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"}, + {file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"}, + {file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"}, ] [package.dependencies] @@ -1331,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", [[package]] name = "griffe" -version = "1.5.7" +version = "1.6.0" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." optional = false python-versions = ">=3.9" files = [ - {file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"}, - {file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"}, + {file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"}, + {file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"}, ] [package.dependencies] @@ -1818,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"] [[package]] name = "jeepney" -version = "0.8.0" +version = "0.9.0" description = "Low-level, pure Python DBus protocol wrapper." optional = false python-versions = ">=3.7" files = [ - {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, - {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, + {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"}, + {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"}, ] [package.extras] test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] -trio = ["async_generator", "trio"] +trio = ["trio"] [[package]] name = "jinja2" @@ -2715,17 +2733,18 @@ pygments = ">2.12.0" [[package]] name = "mkdocs-material" -version = "9.6.5" +version = "9.6.7" description = "Documentation that simply works" optional = false python-versions = ">=3.8" files = [ - {file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"}, - {file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"}, + {file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"}, + {file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"}, ] [package.dependencies] babel = ">=2.10,<3.0" +backrefs = ">=5.7.post1,<6.0" colorama = ">=0.4,<1.0" jinja2 = ">=3.0,<4.0" markdown = ">=3.2,<4.0" @@ -2734,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0" paginate = ">=0.5,<1.0" pygments = ">=2.16,<3.0" pymdown-extensions = ">=10.2,<11.0" -regex = ">=2022.4" requests = ">=2.26,<3.0" [package.extras] @@ -4755,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydantic-settings" -version = "2.8.0" +version = "2.8.1" description = "Settings management using Pydantic" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"}, - {file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"}, + {file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"}, + {file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"}, ] [package.dependencies] @@ -4798,6 +4816,16 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pylatexenc" +version = "2.10" +description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion" +optional = false +python-versions = "*" +files = [ + {file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"}, +] + [[package]] name = "pylint" version = "2.17.7" @@ -5897,26 +5925,26 @@ files = [ [[package]] name = "safetensors" -version = "0.5.2" +version = "0.5.3" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"}, - {file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"}, - {file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"}, - {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"}, - {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"}, - {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"}, - {file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"}, - {file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"}, - {file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"}, - {file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"}, + {file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"}, + {file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"}, + {file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"}, + {file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"}, + {file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"}, + {file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"}, + {file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"}, ] [package.dependencies] @@ -6213,13 +6241,13 @@ train = ["accelerate (>=0.20.3)", "datasets"] [[package]] name = "setuptools" -version = "75.8.1" +version = "75.8.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" files = [ - {file = "setuptools-75.8.1-py3-none-any.whl", hash = "sha256:3bc32c0b84c643299ca94e77f834730f126efd621de0cc1de64119e0e17dab1f"}, - {file = "setuptools-75.8.1.tar.gz", hash = "sha256:65fb779a8f28895242923582eadca2337285f0891c2c9e160754df917c3d2530"}, + {file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"}, + {file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"}, ] [package.extras] @@ -7217,13 +7245,13 @@ files = [ [[package]] name = "types-requests" -version = "2.32.0.20241016" +version = "2.32.0.20250301" description = "Typing stubs for requests" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"}, - {file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"}, + {file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"}, + {file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"}, ] [package.dependencies] @@ -7231,13 +7259,13 @@ urllib3 = ">=2" [[package]] name = "types-tqdm" -version = "4.67.0.20241221" +version = "4.67.0.20250301" description = "Typing stubs for tqdm" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"}, - {file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"}, + {file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"}, + {file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"}, ] [package.dependencies] @@ -7833,4 +7861,4 @@ vlm = ["accelerate", "transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f3b5175d40375322ef5ca45e769e49991d132015a6d462b70715829732e20e68" +content-hash = "c37ae7d39cb2af7031248c2f0308c91160facafd948e982899245e5d8369bbbb" diff --git a/pyproject.toml b/pyproject.toml index b26ade8..23e7aab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,23 +2,43 @@ name = "docling" version = "2.26.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." -authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] +authors = [ + "Christoph Auer ", + "Michele Dolfi ", + "Maxim Lysak ", + "Nikos Livathinos ", + "Ahmed Nassar ", + "Panos Vagenas ", + "Peter Staar ", +] license = "MIT" readme = "README.md" repository = "https://github.com/DS4SD/docling" homepage = "https://github.com/DS4SD/docling" -keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"] - classifiers = [ - "License :: OSI Approved :: MIT License", - "Operating System :: MacOS :: MacOS X", - "Operating System :: POSIX :: Linux", - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Programming Language :: Python :: 3" - ] -packages = [{include = "docling"}] +keywords = [ + "docling", + "convert", + "document", + "pdf", + "docx", + "html", + "markdown", + "layout model", + "segmentation", + "table structure", + "table former", +] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", +] +packages = [{ include = "docling" }] [tool.poetry.dependencies] ###################### @@ -40,7 +60,7 @@ certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = [ { version = "^1.6.0", markers = "python_version >= '3.10'" }, - { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" } + { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" }, ] typer = "^0.12.5" python-docx = "^1.1.2" @@ -56,21 +76,22 @@ onnxruntime = [ # 1.19.2 is the last version with python3.9 support, # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, - { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } + { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }, ] transformers = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true } + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }, ] accelerate = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true }, + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true }, ] pillow = ">=10.0.0,<12.0.0" tqdm = "^4.65.0" +pylatexenc = "^2.10" [tool.poetry.group.dev.dependencies] -black = {extras = ["jupyter"], version = "^24.4.2"} +black = { extras = ["jupyter"], version = "^24.4.2" } pytest = "^7.2.2" pre-commit = "^3.7.1" mypy = "^1.10.1" @@ -93,7 +114,7 @@ types-tqdm = "^4.67.0.20241221" mkdocs-material = "^9.5.40" mkdocs-jupyter = "^0.25.0" mkdocs-click = "^0.8.1" -mkdocstrings = {extras = ["python"], version = "^0.27.0"} +mkdocstrings = { extras = ["python"], version = "^0.27.0" } griffe-pydantic = "^1.1.0" [tool.poetry.group.examples.dependencies] @@ -108,8 +129,8 @@ optional = true [tool.poetry.group.constraints.dependencies] numpy = [ - { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' }, - { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' }, + { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' }, + { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' }, ] [tool.poetry.group.mac_intel] @@ -117,12 +138,12 @@ optional = true [tool.poetry.group.mac_intel.dependencies] torch = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"}, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"} + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" }, ] torchvision = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"}, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"} + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" }, ] [tool.poetry.extras] @@ -147,7 +168,7 @@ include = '\.pyi?$' [tool.isort] profile = "black" line_length = 88 -py_version=39 +py_version = 39 [tool.mypy] pretty = true @@ -158,18 +179,19 @@ python_version = "3.10" [[tool.mypy.overrides]] module = [ - "docling_parse.*", - "pypdfium2.*", - "networkx.*", - "scipy.*", - "filetype.*", - "tesserocr.*", - "docling_ibm_models.*", - "easyocr.*", - "ocrmac.*", - "lxml.*", - "huggingface_hub.*", - "transformers.*", + "docling_parse.*", + "pypdfium2.*", + "networkx.*", + "scipy.*", + "filetype.*", + "tesserocr.*", + "docling_ibm_models.*", + "easyocr.*", + "ocrmac.*", + "lxml.*", + "huggingface_hub.*", + "transformers.*", + "pylatexenc.*", ] ignore_missing_imports = true diff --git a/tests/data/docx/equations.docx b/tests/data/docx/equations.docx new file mode 100644 index 0000000000000000000000000000000000000000..8ab71b96dd1ad5257f8ab3c925376baa0259b8dd GIT binary patch literal 15017 zcmeHuV|XRo*6xbYvF)T|t2?%B+qRvKZM);7W81dvbkMPRv-dgY+c@Xm``r8Y`|5dW zu3Br<`_`(eG3Oj}Kvn_-6a@edfCK;lgn-S08B28_0Du4t06+#n0&57^SUVb7JL)L9 z*%~=$(Yjh$66Apblji_{KhFQ3?Z5F8s7n~N?x9Bzx=(z9k8M=a|Cv`x2^`LyL?eF$ zf#CtH{2Djd_SS(6ET;q%3vEe6%5=ZMq&yHXy^>}Lg;?)GdVzkqD+idCUK8s!V7#sW7@iJ z-vCM8Aho5CsfWOZ)6PNq6koH*;$f|*hZ1`1c{G=u;F&BNW4{{U zfO`>zg~5{|s}coq>WJ^iEJrmoKvd|WR}%iiQ&H2cxvGF#dKG8uM(I0g-O~zwVI)u+r_k;yE<8o-78ZcxP&yD*lIMr;~H!GzrnkXimN&%nkH%ZdrJEdayOtdCZ@=z_^4#I02hFYeI6>a*1Y z2mtW@4g!$|NeCA<#FBSz4S1= z=K-$))9rFA-5B}ObOuwa7|Rfl8e)>jt4rpKZ?Bw7OTb!3dLpAUv++~z4jIBuTM1eh zSV>B-VeL>0kJ`N&&uy-N#BXii8C(}^cVN;F?u=hYh{Y&I0>af$!zQu7V;@3N#=DUB zDFyHLi$0kVQB2Mmkksa9*(ojFr+72vq%zDen2Tz7K$LO`bp+wIOkjFL8}n2rGttCE z_iWT?^(h|5H!((5MPF>tc_n74kL zuuiqc?ej&EI&AvdK=OOF-7U%AK+}>wKV;hWO#~XOUjvND#RRX}e5u`}y-rWE?pO!C zV%>b*g1zG^c0`aMmveZCv4aJ8`MeTS6H}EE6GicT)ZJ-sL-LlOUM>NeOd!yY%DNnH zj+_s_;@wDLi`o4;VAs1ucDs?oAb8e2@1vu`V{ug-;2)vB5x%(AK~`MqCXsB16dG6r zaboCrM4^J&(Ti19bma#O_f;dbLlUJMV|Epo=%J{RN7;E`-QeqS4sDkkF)?%af6`0^ zT-C7z$oVH(z)WEY<`}83fa~7_dT?rVvBNB2NoN8}X$ZXO2qfpx;;QiFVI6k7MNe60 zYGCOlb^pBPZ0qSFl+1t_bc*{NF&qL0fxtOo9Z6r%-X%nT_&XMNWQ ztoJNbe=wp0PXOWFUKUGB<`W+%zNRE1s}8gVLE;_Yu`f&#?ubIYa9_~pDn3*~`t*yC z=FS$l3@|?Mb1060X8`^;djH0yAoi$j)CpN|GYUL@WO>S;r36D*(GnuAJ!bD7oVpmw z`Y5q#s7I#2ht~XOA=LEEdMo0t=&=F`_+wo%`kEl!>~i`L-aR{AW3F%qqeG+M%x-hv z-cWcRAgeow-j5qSQx7U66< zc?#xk9v)qAQeH<4X6RhVN=Tcx!VM$^R7eDD2<-8OaY{Jq7CzY3KkP@{*tU*5oYUc( z*$Z>e_B?pwp$(ogP=&6!D}ANpx@jSGuXS_ewi6^(n{r#VBa69?=~5OG)tCgcXJvFI zcSvl|)EEu2rxp=7-AQZ7)J+dX2K|L!tYHpxIf9ukr^hqbi!1~awpMG~j9W0kN~wl= zsajHz+(ttWrlumkH{ zuqSQl#Yud@f6@wNy#D-oIUY~*SUN*iI|vGL1MloeZ6v!EtEd8bHHYh(B=~uuJ(MT< zoD0^i@1$#leSY71(_xN1^CIeNX-3o-{9tx?n}&zm&tLoFtD{Yg$@#-G$G`R~R!5_h z#&zDmt5%KxM!0L$$lKFW&`g#+>%En^6)_d|RqbTc5}AnL*~PXXA+*%hsHGcR6=tR& zB(J6h57NfP^!KDIDXfBIQG7Oao5^g0NbCz`(ae9@N2-8|mBL2tyGYZ`x#KMnPQn`H z_JG(7VU?Ba@2f+&vS;d-_jr59jLFtT1+h$l2qUtiYRC)014e;Dxl{wfhlB;rMDcaD_bx0pe=$KohHBWHo*>HZA#?@TB z?W{(i$_uJ^G0ZCg_wZL7F1Snb2ucJex~ju~T}LGasnCQoTsR*oZS-4nPlBg#vmrsx zMMaoqE~LX2uVVkKa~Q)umEvFR_MWc*`ich*yG$RA% zp63J?sh&yQTgRKIQlHV?e?d@2d@Wx^M~JF87QHv9&MqTa?AKSg_2!|A&rm;bDv8g$ z^U@HP$NlNC&Yo24Li+6JqS@xo$DyokSZ_6}pB}`d7`g9ML+xc&U3$qIm)<$8?cIh< zrJ5sJGeQpQ>SeOZJm}>VvIr`S7}1CZ*k)1K#w-jiX=A%p`?h-T+8xr{)0>rd)lLAT zOVSUzrC)DUOjVNuy+>0kWfl9yZ+`!()ucYEK05c7p$=|fsw;T?oD^4&(TR|uZa4l5 z_`Kw(OH8|y{{}Z-!va*q&u7X+Y(BqAgyoan>2?{Oom1kTvI>>a4Wp@3&}Tgs`(H~x zU-@VFMP4~P5Z_)`li!twEfLv~U>(TeX2Di8)c0yRKV!FunVJtPXK$_^-goT3X!C)W zS^|zRz~9vu#%n1j!^BBF-fLWS{c?d%J?R!MlQO6E@Cql-$A2DOdpy5v92Sk35bfgb z-9vL4l)~IEG&Baq5pKtRzj}ikEW?_-^71~nhSu4%$qeGuuP%Nj^sgWn#^2(J6Lx0E zSsA>4e(+w|ZdW-6>C9w$8HIeg3);l?Hi7jy3E!scxoDPHzI@Epe_0l}>znR){Q0xs zJcKf#7g`JPdcN_8&)ck&w`N>)nhlOseC9Z$Z(==}PLI#!@^bl@Xps6VDg+{Vd5)NR z=wh$S26GfD%r#6K?Q!B2?@gxM?t$ByRD)*-@KIX{vux*U%bsTo8ZlLKc$$313`_ev z$fjll1Yl^-Wx86X>yYSiKA`T^_URKzh#10-eb!`E#u2re4mYS6lIw{Dx)kf#1VSC6 zwaZY9R{b;yWL)Iv5vLekYWWu`+fdCb+Ru{({m#0tII-EixUseenenPA;?j~-(_ouN zW_QF*j_RcA(@l>&A9mWmTXGAAwQ@Hf5{8d;5Dowm=yyx*AIqIT+jIX~`T%`+AwDd^ z|K3}9+~|kt|FH`H_7;%hMZ@7G%fDwSPPn@83J}8MDW--RzuE5mAu4tC3u0*&XH*G3QZ~Gh~nOGvbHGBBg5y-j6dr1Pc|56I@D+SX~WmZOg1Fb7_gWTR*`( zcKIoW;Q9o^uQ<#`+&L0-k zX&YMsY&IZYxfKX`Y~Lfsj@ZRjaWxFV;^`#>V-AIsCBmlg!wndVWT{KL^7~7=^s>r2 zwQ90vY2O;~2hOWf9GcW-axbH(+8dFhvwxBk%5b|semJH6U29ByU+N~n0f2D=008wv z8GhFqM^hs!Bf8&PhTqo!YUAP9Y$%=RPdp*b9C*>4$*uuBwNlRUq$dS?tU2vT+J@-}u$tDQ*@7C}5eVrX=1|z6G+r5>uL4P7W@o$0ij*ByBRJDseZ7eBXxlO8751-osh!4QiCc8K?E*DmwbLB zOZpPSFIWEPv@+p7^aUNN^6Nk#F2(#%%+%6wc|pg6zY7$|=a@R2FOz{;aMK{$0N~~r z5SqbxA8EULb2}PA!6-{%>zwx_dkMecv03{u{0gx3j>Y1Hg(nu@5G&2jTy6_5EWn^a z^Gu)wnnvluT2N!3G4F7`kV~yLW%@(J+f~DxamB6@OdB|9bmA`-xU^U8`7R6DE7kR$ zMy(&S)Yz`D;&>Js58|C#_fO#30l@zG%d9QRHq7P{H&~3{4JV;(7R;^N8v~1q^~qY0 z9!2;1)ghxPutj~HX;IjL^%3(aa3nN2&=Lah4jBACLG;n*X*#!Rmym$`*l|CJZeU;I zqbJUEjC@p1-T`??*;cK~z?LtI_iOXK+&R6xt{v>me&to&yVKn>+1Gx5 zxZT|J0eXA74PKP#tlo7)dp|y;(|LUx>?}bCudvB(cYizGchPx$oK^VLHoisv34aH{ z&h}HLKL>Ndr#eYr;nQCvK|p~(@=_eh8f(o;v8XISUk^3POdP1K;X=w-I=Ha4tz{1~o>fa3vGIbB z3lM3+d7%&j&0%B`8X~qvSL-c;a2hAfmH_St+v7&@i`<*`kAw|yAmsW*%+KfJ&)zt^ zX8bAOOp5$EY5o_4A!3>+*~M7UT5y7KEZ5f)aQcXf3_BFV>g>x@7&%C|lB&3z12{RW z4+q|U%QK#EQ0wI8s*4d;4MYf5D${Bb@7!ASC&q6>g>B>f#QhDJ4ZDW^dO7l?&$@OuR*3?n`Ymm#td$Ty)5aE%1wZ3wY zYFP&wOiYGP#AIywG}^I!Y>ClA zJNo8$92Z`tnQehJYP);etW@c=M#wLh=&W4q(QvVx64XEHLX3z`<>3g($saBhJidbWlVHE z!W^4)6+hsFZ;|^}XA~BgTs+FxPx47NQ)fFiDm7veTwnM~C*8Z2PRzKZ%LBsdHY+Nh zjY`PkHq^P=A+^dDX>@cNDj~_28wq1m$4KNFVv2u!og~avaVDfrLqmi8&Yk@2P2F7x zPa05aacNGfN75@5bR+mk5v${N-@}z@BI&T4W#lH+f2+@F6*Cl=E=GhLhUh60^g1ZjR|#9 zkazMJ5)-X^3AM*bZ2JHjpM)XUCONf9>lFci7K(WValCD)Ez?8T-OJS7J7akAD_3StWkm z(M>6~p59#9JmTb5)c1@JiD`ah2+q>K#82~!r9?}YFDBI*H_gqYPbeEaU9Jo+L`kbe zLxZmI=%=c%w~TU`15r~UrB6Nml2&_?B%|M%mQgYU@EE4`7x;Da)|uPVb^q1kyHAPs z{WQ#_zwC3hDv~5#&4+(EwTaA-6G2MM8=m>SRLh`^y4<(;oVlI9ha_{qHcuvCjMiVoVm4LI#=an!6QJsam5r4@5`280M#%1 zM)?yq=z94^jp8{?q0VXQIjXG1%DL4@@9(dsW){s?^Yvo>+~ZSt#Av9j4=M|ks|d4))r6d@$sCRK0AL#fnCTvgb1%+6c_D)SDt zEK1;7_vx%gMs^;p7^!^v#(q7<$%Hu>F&-REm*JEZI-?k7hM;{L0EgJzr)XK!t&?u# zg5y}n2TV)`mI@A$7ZAPJ`F5q3ZLSYfd;}*8nR(@jbs9l2t1WZL(G%MdM~VnQleaV6 zQnK>EDJd#FJ%~#lnu=gFhifn1#0Q6s%m7bBG835IXI7CP?NNrt!9j z*GN^#fZXN0L(xMBKyeq5__h2HT?5o~O)|Op9AYxjw!wYcx$ST-zn$N&(NhclJ?_gi z7HbF-J^^aQ?gQ%d^gZf>HTSffpWBj4+3ZmKC}f@rXm3WF<^Y2=ySuL2h`^;0XD>0l z6*4xQ4|NJMu%?zEBCIFQo!+=i6Emx;3ufkKepbB@_^%jlWy5jRs!@zxbH=8)v!RxC^PJmAMdU6a~kr(+SLzv(FVV&_*ylelushPR^Q+cj&Ck+-q@d z?9Jm=wd5&UyvINE3b+(%&bvVPcs=64xTs!;NFnsZ6ye6<1Q^d20d*6|#>IS4V0kv9C{9nd0frs}_91BPFBdWg#|DK~PST)fs!? zplb-pUuTxiknEyJ92+{qdgDzHbn}xhylUL*q6Q?L90?plx0AmL z%I-JJ9(zzhEjnEjW^k0Ad&xE7gq;e|tPoSkLv03s#M&-ITZ13Qn672RPOkLT8gg=n zzxaUWvrF&%#9@pM5@Y5GhsiUn<_z!KcC}xaB#Ox6tJT;fXv`OLT`-11gxMkcSTVVo z(^2|RcGgI(Y}fbWTuRE#sV$dDPMRAY9|b{2n}8lLd=XkuI_S1;1ZNhBpvl$b+(H^r zkaTjBfNnIVDrlgw$GsE8gXz6_aDZ(t|~bR;%M%!-Ko8iAB;fZJ>V>#GSvYie}Qqm>Kdc-)yOqD)J? z9q0Lm@Mym+V_5tn^;6>!$a~}d(|;a*P9H=Zp%4H7iy{C3{$KXDgQJ_J(eH!Lsn)9f zsxXq*X6Y*gO13jTPAHMka49*3Si?ePJuYb++oXUlIT0`rEkIqq!}~A@8yh4K!b;uF zN)t5}@I00ylRa>=#mMvaQWw@n$2s0Z!i^v4!;a&|c^6L*aXu-MeO{@P8P!H@V%L>N za&VY$k`?Lwc69FZe0fPdnoXSL223z9q5Sq}ntFIr8xSz2bA>wUExN#o#(^%@qTl?G zwt-)+`c$RXc@;mOw!h68(ev`S_4C4(m66^IKf$mX((8@vl8p-QaO90(KbBW=#}c!$ z7Wh{+{E9&w_zCl+TT*C%wh)$0e@{~&rVP*&<;^UK1ll9~j;>@hN&-DTeyL1ZF);f8 z5Dt)U>SVSpMiKZfuj+zb4vDD+)+aF4wRH`#ZRYIu8u$HNEzGex=xu$bWY&%mP!U03h^ z_>Ur9dot#&j^U40A$t(p_1S@r0ewEb=$6R)sV`|81qWr1g>Ral?XujWO{lMXFQji& zm|LKQp3iA_sBuEMGK?OD&}tSkB_4B~qOB-Un4e2Fs^03S^jiC9G-~M)iymG_b^{znXdZs z`Z=9hQa%=bQsjqDdSs&~YR9Jl-@$tPfl>OcPCB=jFS~s8uZtF__QnC(mim`&_KXUi zB5D_R!-p2Bg#8;SV&^__;IS{aWaU^Y2k$gp0|VVwcM=nOWx+f1W^S-${Z`s zW8(|eWjeW7X{TkmM;%ySH&g^_y;vM;egsM1&&T*bron!*BVg=!;<5zKIqRt|MvOE- z#Ks!TBV~>fqBO+`N19?!;uYCd-NA@7fKmXJ&U$DdF9&<>L;AJe)H;c1-P9@r_0dcM zJ{sf>?6b3%v!GUv%+~F*iH(n+;#u%i;OpD&`Hk(kRgJ<)hIdI0vv9KDgbk{@dSherHzf-{i54QG`j7h2Xw9H$a0 zLQ;if)6~ZurdNZL)1F^qV$F8lrc?)Al5A?DRyPf@vTbhgcmY`kYKvQlnF_O_2SxM{ zk24|um4Z=cesmkRtSV^KoP#RXd6F0DMqV51yIAje)*>gW(Q>bD!3jxNoCw|&;l_bk zo@7kRr34CM+l1p6Ok~C z!R*HD;gnvyjb|T1Nq}J0r6-(lh}cU9x{fMaaQ!5hN*W4}CVEGR2twKLQNvco4loqZQBdq z03_m6rGhwGGvy97j~RVz`-$^-7gzfQvyDW2rk^a&K^I8$pc^=5?3-AfT0iDKA{A#3 zh_dV4pacdZbhNIx^!6(iEC?eb3I-}NbVL|9m~g)zFfpm_1MXLrs_n+3xvrqivi$Cl zAOH(}DLXOOE3(#piJx45DsX-n6PPfn33LSI1epzIhiV8P{W?VDCh45?Tg7sEFGB1NPM3SGD^nVsJh8zL6C0QDJs9D?sl0$75Z*xQLE*r zsNV2xOH2iYrC>+vp`%S5uzXa6+3Lk}x%7*HvL;`5&rsy3f{H^hxyyT3hu_V3kVI-; z+wC57=-C`$oyy^@MeL@{Yu8`lJ-B5f+)p9H?gy&L%zKX#@#xa)636-yq5B~&hu}G= zVerqy(DH-ggb%*V^izcP_ykOQkXcQs;0r}TC zPB^O?;WP#XU|czG`)6@KRoM8}<5@Tq#-PUB)w_6`XbxOzCbkc>GS?V-&~fS^`mseO!lRSUYbese^_ia+AvL*YXBp=@MD9#OyUyBKBYqldvSK4R=^~y;? zc2BC6sNEV$ICHl0RHG5JsM}0sMyZ9`S{$*6Sa&Va*nGes+)bsSGF1TGW&dXy;f<|| zaU~`{gv6YNR@;CJSNz{$|U!ovLX+;nWmr}w6rVr9c zb6L{-k|4-`kcMmtviRB+Vexg7Jj%Ks&l|NKEUEd3YZ?^ja(K>jZL{cqgJr38Zh@NLnmX1@Qqfqoxl zG@1eH^JV*xeP0d9Yuiu6qZnUS#O-29^-WyAObDd^++myP5&ULlSKQAVq#Q_*#}hwc zR8h=e8F{k&+K9CbE`7AVMaZZrzXWMblL`;y!Q75&z9%Cz|4GOlgc)xXow&g=8A#5O zF0EO;M)ACYzy2W-(ZGp1+QbYEy#K7XcX|53U4Ldk{m1lO*W**Xsg6~=4k!<&!Y1P@ zpn^Vo)FliGUefq@nW|yP&7J!zOR3b2Mmo1==tvTbQ(@%9{?-yc?r8ULrp-8Ob7wV? zW^au;qO^4fwMyXi_mNR=Eg2bOO4hk(C2)3phVSp@wQUqa04;5cKy#qx9eoEcpfBwe z4Rz@f7dUDGg9QX>TVIY5ITNXG&$04eMz&ebFtvq@A0~P%^oJ^glQ&rD+sGC!7#lsN z0vGoa)s6prT&NzeSrs^`K221uonb$^sn0V@9;KM~4mcExxnx;(qM0B+-HCDQElT zM#v5>RjJTp{%$(-Wij}W4aJ~iQ8_E}-ptkewa1NC^N|HraaZ^}Yp5Pv@jPVYFt$+L(-nWz*PYp7 z=_CtWMp7X3RfRK*oALF;(CXc+4_zKj|NKF=UBtoSU6T$> z6N}Pv{N#HP$8^GRsu=cn5y6DC$NCPF?DOnxLZ&FupxeEi$ZaUZUSsa~tZf^mrp09} z#-m_EiOI?3r8J~!qe2uNIF08lSAm+m!y(IZ9OP>K!rKmf*x2iw!xBr*u@F~!1q&Z2 zr>RLv0*Yy55ms^O5OM}MuiTl31CpPqIM`4{f+Q)mTRQycI?WrKQlVLn)pA#9AxOKA z=%wLaQbkAT8kW!MHdYP{&UjLUN`du~BPsr{EF#KM_BL7=_xo38e;pUsqiu5JA&+aCMMYp4 z70(MvyS=%xjm?gNd?DHIsf;eBbkm3^Zmea+XT)P~4+2q#F`N9BqLE`FIyqUs(_TUz z{9wqL3xlxpsH)WL)UnV#zw-3h=J>Nf{5xAFJeb`aW{ZFf1Z?bS$P#+d^8}7syT
)*4+*a~|8N+5_yqnvYDc#5Bx>^!y`}kZT%!ETYv5v}uka_o zWyb8fbq@gwVC$LGBRI(8`&SkT0~R&32{ z1)I)Uy?96W*HE(0`Fi3=^f=#`(=$B1D#4>k?6NSsQLu)FfpFAl&(05=lzX2*+TeX! zNXQ3_(hdu5I!4Hy+CazG5UA+&YIw_1;k8WbjjAj7=-q&1&5dCg)Grpfr;o*a1luN-;eZl0E(6 zSnS@tk9~zudYci0R%QYG{RF2gL~T0dZF5%;%BI=W3vkqty^<8=o15DF!P=Zx{dE8F z1sUcZ_b`2=9J78D1|a?G1sQ)-11Re0TN?cyQ;i?BUi~ff4E6*Md|yreh&UL6%S@LO z?krb;&#RCiBuD}VXSM9JNl3u~9g(9vWkl13wLXnArWs{@epf2(+~d#eE>?p`8alOx zB5#wlcRzYPpFy&fLaS<3ry3Ukuy1^N+HcTG5atlA((dKIBrVCXKr@4GLkjC2V&*rf zVOC}cj&_cxG}dVxM`bJ3ldd$PStlH)$c%+Epvz6ENzmlP>QLfumCaU{Bc3)bHUtq- z_kgNkX)rC{1_qcYJMB@B-9p7nP4J4_TXZE@2lhJRf}tdtFbkG?gGdGgT`_e6nrE*}o*W0*Ma4CM_zYBF*0R^tW^4Gwa@j3?i#$~NU;1#x7_9b2RtpavXsRhI`N ze&*8GtXjEPqs1p>B|rJgieC~vofgKG<<#Ig8Zn>C6@vD{2X?;jUkP|?fd z4n=Kt{Mlw1?_icLg_DzVMp*d!$%QOM<`mU`kZ)sA+T$NQJii9Sy{a#)3FeW zm4~y>Od{5AXQF$WXQE3Hdvg2=wY~4Kgz%CT)wO#N4)YMs5A`7@+lz)NxY-);Y4?i; z{&4&8@mDs#egdn${>RK+WGVk{?+*#sN4^K*N502Ll83CljjaQnfsOs|8upRj{y#|` zAAK((PEO{d)M9W`>IF_>Q=~N)G?@X3g_BR;QFvFw3ck2pYZeWBbHh4dn~03etLfzR zZ1bd3s>nkPl!IP5!Q4L&h!2Qd**;ep`6g|$hXJ}gKWsKnk)98t$}~+&=ey+Ra^?#| zao7Q7h26Fi2ocxf2nN!KU%%mkx?vK-l7nlna ztV>H`^?Pj`(0F&Z&7(P`!(ZPPmoR&TwH??&OfiT}oZ#is6py%hS? zX_z1MbLPg>n}h74D?O&g7(Yc_c=E7;D9~qt}(_{_BrKr8pfy%jGuZZ=&&}VfPy~k00uI+fx zJmvs0XQ~!sd1q4jM;LF z@#mQP-{F6clKlZ^!uuEezhY*82mjqz{sU}A`%m!SOy|Ed{N0-Sg8`cUpA3Jo?f#Dc z>s0**8UV0j1OWbR;{H4QuY=~F;Y2Kdg8y>}m6ZVdP>