feat: equations to latex in MSWord backend (with inline groups) (#1114)
* Equation groups Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * fix: Proper handling of orphan IDs in layout postprocessing (#1118) * Fix the handling of orphan IDs in layout postprocessing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * chore: bump version to 2.25.2 [skip ci] * docs: add description of DOCLING_ARTIFACTS_PATH env var (#1124) add env var in docs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * fix(CLI): fix help message for abort options (#1130) fix help message Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * perf: New revision code formula model and document picture classifier (#1140) * new version code formula model Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * new version document picture classifier Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * new code formula model Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * restored original code formula test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> --------- Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * feat: Use new TableFormer model weights and default to accurate model version (#1100) * feat: New tableformer model weights [WIP] Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Updated TF version Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated tests, after merging with Main, Switched to Accurate TF model by default Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> * chore: bump version to 2.26.0 [skip ci] * fix: Pass tests, update docling-core to 2.22.0 (#1150) fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Updating content hash Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> --------- Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
aa92a57fa9
commit
6eb718f849
@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.docx.latex.omml import oMath2Latex
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -260,6 +261,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
return label, None
|
||||
|
||||
def handle_equations_in_text(self, element, text):
|
||||
only_texts = []
|
||||
only_equations = []
|
||||
texts_and_equations = []
|
||||
for subt in element.iter():
|
||||
tag_name = etree.QName(subt).localname
|
||||
if tag_name == "t" and "math" not in subt.tag:
|
||||
only_texts.append(subt.text)
|
||||
texts_and_equations.append(subt.text)
|
||||
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
||||
latex_equation = str(oMath2Latex(subt))
|
||||
only_equations.append(latex_equation)
|
||||
texts_and_equations.append(latex_equation)
|
||||
|
||||
if "".join(only_texts) != text:
|
||||
return text
|
||||
|
||||
return "".join(texts_and_equations), only_equations
|
||||
|
||||
def handle_text_elements(
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
@@ -268,9 +288,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
) -> None:
|
||||
paragraph = Paragraph(element, docx_obj)
|
||||
|
||||
if paragraph.text is None:
|
||||
raw_text = paragraph.text
|
||||
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
|
||||
|
||||
if text is None:
|
||||
return
|
||||
text = paragraph.text.strip()
|
||||
text = text.strip()
|
||||
|
||||
# Common styles for bullet and numbered lists.
|
||||
# "List Bullet", "List Number", "List Paragraph"
|
||||
@@ -323,6 +346,45 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif "Heading" in p_style_id:
|
||||
self.add_header(doc, p_level, text)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
||||
# Standalone equation
|
||||
level = self.get_level()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.FORMULA,
|
||||
parent=self.parents[level - 1],
|
||||
text=text,
|
||||
)
|
||||
else:
|
||||
# Inline equation
|
||||
level = self.get_level()
|
||||
inline_equation = doc.add_group(
|
||||
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||
)
|
||||
text_tmp = text
|
||||
for eq in equations:
|
||||
if len(text_tmp) == 0:
|
||||
break
|
||||
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
|
||||
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
|
||||
if len(pre_eq_text) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=inline_equation,
|
||||
text=pre_eq_text,
|
||||
)
|
||||
doc.add_text(
|
||||
label=DocItemLabel.FORMULA,
|
||||
parent=inline_equation,
|
||||
text=eq,
|
||||
)
|
||||
if len(text_tmp) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=inline_equation,
|
||||
text=text_tmp,
|
||||
)
|
||||
|
||||
elif p_style_id in [
|
||||
"Paragraph",
|
||||
"Normal",
|
||||
|
||||
Reference in New Issue
Block a user