feat: Code and equation model for PDF and code blocks in markdown (#752)

* propagated changes for new CodeItem class

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* Rebased branch on latest main. changes for CodeItem

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* removed unused files

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* chore: update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* pin latest docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update docling-core pinning

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* pin docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use new add_code in backends and update typing in MD backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* added if statement for backend

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* removed unused import

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* removed print statements

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* gt for new pdf

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* Update docling/pipeline/standard_pdf_pipeline.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com>

* fixed doc comment of __call__ function of code_formula_model

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>

* fix artifacts_path type

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* move expansion_factor to base class

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Matteo Omenetti <omenetti.matteo@gmail.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Matteo <43417658+Matteo-Omenetti@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Matteo
2025-01-24 16:54:22 +01:00
committed by GitHub
parent c58f75d0f7
commit 3213b247ad
28 changed files with 704 additions and 315 deletions

View File

@@ -3,19 +3,22 @@ import re
import warnings
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import List, Optional, Set, Union
import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from marko import Markdown
@@ -27,8 +30,7 @@ _log = logging.getLogger(__name__)
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text, max_length=10):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@@ -90,13 +92,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
def close_table(self, doc=None):
def close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells = []
tcells: List[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
@@ -137,15 +139,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
table_data = TableData(
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
)
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
table_data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=data)
doc.add_table(data=table_data)
return
def process_inline_text(self, parent_element, doc=None):
def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
if len(txt) > 0:
@@ -156,14 +162,20 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
self.inline_text_buffer = ""
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
def iterate_elements(
self,
element: marko.block.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings = []
strings: List[str] = []
# Define a recursive function to traverse the tree
def traverse(node):
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")
snippet_text = str(element.children[0].children[0].children)
snippet_text = str(element.children[0].children[0].children) # type: ignore
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)
fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
)
doc.add_picture(parent=parent_element, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)