feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-10-16 21:02:03 +02:00
committed by GitHub
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions

View File

@@ -1,68 +1,63 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
from typing import TYPE_CHECKING, Set, Union
from PIL import Image
from docling_core.types.doc import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
class AbstractDocumentBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "PageSize":
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.path_or_stream = path_or_stream
self.document_hash = document_hash
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format
@abstractmethod
def is_valid(self) -> bool:
pass
@classmethod
@abstractmethod
def supports_pagination(cls) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
@abstractmethod
def supported_formats(cls) -> Set["InputFormat"]:
pass
class PaginatedDocumentBackend(AbstractDocumentBackend):
    """PaginatedDocumentBackend.

    A paginated document backend is a backend that can report how many pages
    its input document contains.
    """

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages of the loaded document."""
        pass
class DeclarativeDocumentBackend(AbstractDocumentBackend):
    """Backend that yields a DoclingDocument directly.

    Declarative backends transform their input straight into a
    DoclingDocument, without going through a recognition pipeline.
    """

    @abstractmethod
    def convert(self) -> DoclingDocument:
        """Build the DoclingDocument for the input and return it."""

View File

@@ -5,12 +5,14 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
class DoclingParseDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser()
success = False
if isinstance(path_or_stream, BytesIO):
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse could not load document with hash {self.document_hash}."
)
def page_count(self) -> int:

View File

@@ -2,15 +2,19 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_parse.docling_parse import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser_v2("fatal")
success = False
if isinstance(path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
document_hash, path_or_stream
self.document_hash, path_or_stream
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(document_hash, str(path_or_stream))
success = self.parser.load_document(self.document_hash, str(path_or_stream))
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse v2 could not load document {self.document_hash}."
)
def page_count(self) -> int:

View File

@@ -0,0 +1,425 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
GroupLabel,
TableCell,
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Parse the HTML input into a BeautifulSoup tree.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: HTML content, either an in-memory byte stream or a
            path to a file; both are read as UTF-8.

    Raises:
        RuntimeError: If reading or parsing the input fails.
    """
    super().__init__(in_doc, path_or_stream)
    _log.debug("About to init HTML backend...")
    self.soup = None
    # HTML file:
    self.path_or_stream = path_or_stream

    # One parent slot per hierarchy level, all initially empty.
    self.max_levels = 10
    self.level = 0
    self.parents = {depth: None for depth in range(self.max_levels)}  # type: ignore
    # Per-tag-name counters filled while analysing elements.
    self.labels = {}  # type: ignore

    try:
        markup = None
        if isinstance(self.path_or_stream, BytesIO):
            markup = self.path_or_stream.getvalue().decode("utf-8")
        if isinstance(self.path_or_stream, Path):
            with open(self.path_or_stream, "r", encoding="utf-8") as handle:
                markup = handle.read()
        # Any other input type leaves self.soup as None (backend is invalid).
        if markup is not None:
            self.soup = BeautifulSoup(markup, "html.parser")
    except Exception as e:
        raise RuntimeError(
            f"Could not initialize HTML backend for file with hash {self.document_hash}."
        ) from e
def is_valid(self) -> bool:
    """Return True when a parsed HTML tree is available."""
    soup_missing = self.soup is None
    return not soup_missing
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not expose pages."""
    paginated = False
    return paginated
def unload(self):
    """Close the input when it is an in-memory stream, then drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles (HTML only)."""
    formats = {InputFormat.HTML}
    return formats
def convert(self) -> DoclingDocument:
    """Convert the parsed HTML tree into a DoclingDocument.

    Returns:
        The document produced by walking the HTML content.

    Raises:
        RuntimeError: If the backend failed to initialise (no parsed tree).
    """
    _log.debug("Trying to convert HTML...")
    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )
    assert self.soup is not None

    # access self.path_or_stream to load stuff
    doc = DoclingDocument(name="dummy")

    # HTML fragments have no <body>; walk the whole tree in that case instead
    # of failing on `None.find_all`.
    content = self.soup.body
    if content is None:
        content = self.soup

    # Replace <br> tags with newline characters
    for br in content.find_all("br"):
        br.replace_with("\n")
    return self.walk(content, doc)
def walk(self, element, doc):
    """Analyse every direct child of ``element`` and return ``doc``.

    Walking is best-effort: an error in one child is logged and stops the
    traversal of this element, but never propagates to the caller.

    Args:
        element: Node whose children are visited. Nodes without a
            ``children`` attribute (e.g. plain text) are skipped.
        doc: Document being populated; returned unchanged as an object.

    Returns:
        The ``doc`` argument.
    """
    children = getattr(element, "children", None)
    if children is None:
        # Text nodes reach here via analyse_element's fallback; nothing to do.
        return doc
    try:
        # Iterate over elements in the body of the document
        for idx, child in enumerate(children):
            try:
                self.analyse_element(child, idx, doc)
            except Exception as exc_child:
                _log.error(" -> error treating child: %s", exc_child)
                _log.error(" => element: %s\n", child)
                raise
    except Exception:
        # Deliberately swallowed (the error was logged above): conversion
        # continues with whatever was collected so far.
        _log.debug("stopped walking an element after an error", exc_info=True)
    return doc
def analyse_element(self, element, idx, doc):
    """Count the element's tag name and dispatch it to the matching handler.

    Tags without a dedicated handler are walked so that their children get
    analysed in turn.
    """
    tag = element.name
    self.labels[tag] = self.labels.get(tag, 0) + 1

    if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
        handler = self.handle_header
    elif tag == "p":
        handler = self.handle_paragraph
    elif tag in ("ul", "ol"):
        handler = self.handle_list
    elif tag == "li":
        handler = self.handle_listitem
    elif tag == "table":
        handler = self.handle_table
    elif tag == "figure":
        handler = self.handle_figure
    elif tag == "img":
        handler = self.handle_image
    else:
        handler = None

    if handler is None:
        self.walk(element, doc)
    else:
        handler(element, idx, doc)
def get_direct_text(self, item):
    """Return the stripped first direct string child of ``item``, or "" if none.

    Only non-recursive strings are considered, so text inside nested tags
    (e.g. nested lists of an <li>) is ignored.
    """
    first_string = item.find(string=True, recursive=False)
    return first_string.strip() if isinstance(first_string, str) else ""
# Function to recursively extract text from all child nodes
def extract_text_recursively(self, item):
    """Collect the text of ``item`` and of all its descendants.

    Args:
        item: A string or an iterable node.

    Returns:
        ``[item]`` when ``item`` is a string; otherwise a single string made
        of the node's direct text followed by the text of every child,
        joined with spaces.
    """
    if isinstance(item, str):
        return [item]

    # NOTE(review): the direct text is added here and string children are
    # visited again below, so it may appear twice in the result - confirm
    # whether that is intended.
    result = [self.get_direct_text(item)]

    try:
        children = iter(item)
    except TypeError:
        _log.warning("item has no children")
        return " ".join(result)

    # Iterate over the children (and their text and tails)
    for child in children:
        try:
            extracted = self.extract_text_recursively(child)
        except Exception:
            # Best effort: a child that cannot be processed is skipped.
            _log.debug("could not extract text from a child node", exc_info=True)
            continue
        if isinstance(extracted, str):
            # A nested node yields one string; extending with it would
            # split it into single characters.
            result.append(extracted)
        else:
            result.extend(extracted)

    return " ".join(result)
def handle_header(self, element, idx, doc):
    """Handles header tags (h1, h2, etc.).

    An <h1> resets the whole parent hierarchy and is added as the title.
    Deeper headers are added as section headers under the parent one level
    up; skipped levels are filled with section groups, and when moving back
    up the deeper parent slots are cleared.
    """
    hlevel = int(element.name.replace("h", ""))
    label = DocItemLabel.SECTION_HEADER
    text = element.text.strip()

    if hlevel == 1:
        for key in self.parents:
            self.parents[key] = None
        self.level = 1
        self.parents[self.level] = doc.add_text(
            parent=self.parents[0], label=DocItemLabel.TITLE, text=text
        )
        return

    if hlevel > self.level:
        # add invisible group
        for i in range(self.level + 1, hlevel):
            self.parents[i] = doc.add_group(
                name=f"header-{i}",
                label=GroupLabel.SECTION,
                parent=self.parents[i - 1],
            )
    elif hlevel < self.level:
        # remove the tail
        for key in self.parents:
            if key > hlevel:
                self.parents[key] = None

    self.parents[hlevel] = doc.add_text(
        parent=self.parents[hlevel - 1], label=label, text=text
    )
    self.level = hlevel
def handle_paragraph(self, element, idx, doc):
    """Handles paragraph tags (p).

    Paragraphs without text, or with only whitespace, are skipped; others
    are added under the parent of the current level.
    """
    raw_text = element.text
    if raw_text is None:
        return
    text = raw_text.strip()
    label = DocItemLabel.PARAGRAPH
    if not text:
        return
    doc.add_text(parent=self.parents[self.level], label=label, text=text)
def handle_list(self, element, idx, doc):
    """Handles list tags (ul, ol) and their list items.

    A list group is created one level below the current one, the list's
    children are walked at that deeper level, and the level is restored.
    """
    kind = element.name
    if kind == "ul":
        # create a list group
        group = doc.add_group(
            parent=self.parents[self.level], name="list", label=GroupLabel.LIST
        )
        self.parents[self.level + 1] = group
    elif kind == "ol":
        # create a list group
        group = doc.add_group(
            parent=self.parents[self.level],
            name="ordered list",
            label=GroupLabel.ORDERED_LIST,
        )
        self.parents[self.level + 1] = group

    self.level += 1
    self.walk(element, doc)
    self.parents[self.level + 1] = None
    self.level -= 1
def handle_listitem(self, element, idx, doc):
    """Handles listitem tags (li).

    Items containing a nested list are added with their direct text only
    and then walked one level deeper; plain items are added with their
    full stripped text. Items of an ordered list are marked as enumerated
    and get a marker derived from their position in the parent.

    Args:
        element: The <li> element.
        idx: Position of the element among its siblings (unused).
        doc: Document receiving the list item.
    """
    nested_lists = element.find(["ul", "ol"])

    list_parent = self.parents[self.level]
    parent_list_label = list_parent.label
    index_in_list = len(list_parent.children) + 1

    if nested_lists:
        text = self.get_direct_text(element)

        marker = ""
        enumerated = False
        if parent_list_label == GroupLabel.ORDERED_LIST:
            # NOTE(review): no trailing "." here, unlike the plain-item
            # branch below - confirm which marker format is wanted.
            marker = str(index_in_list)
            enumerated = True

        # create a list-item
        self.parents[self.level + 1] = doc.add_list_item(
            text=text,
            enumerated=enumerated,
            marker=marker,
            parent=list_parent,
        )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    elif isinstance(element.text, str):
        text = element.text.strip()

        marker = ""
        enumerated = False
        if parent_list_label == GroupLabel.ORDERED_LIST:
            marker = f"{str(index_in_list)}."
            enumerated = True
        doc.add_list_item(
            text=text,
            enumerated=enumerated,
            marker=marker,
            parent=list_parent,
        )
    else:
        # A %s placeholder is required for the element to show up in the log.
        _log.warning("list-item has no text: %s", element)
def handle_table(self, element, idx, doc):
    """Handles table tags.

    Builds a TableData from the <tr>/<td>/<th> structure, honouring colspan
    and rowspan, and adds it under the parent of the current level. Tables
    that contain nested tables are skipped.

    Args:
        element: The <table> element.
        idx: Position of the element among its siblings (unused).
        doc: Document receiving the table.
    """
    if element.find("table") is not None:
        _log.warning("detected nested tables: skipping for now")
        return

    def span_of(html_cell, attribute):
        # Malformed span attributes (e.g. "2;" or "") count as a span of 1
        # instead of aborting the whole table.
        try:
            return int(html_cell.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    rows = element.find_all("tr")

    # Count the number of rows (number of <tr> elements)
    num_rows = len(rows)

    # Find the number of columns (taking into account colspan)
    num_cols = 0
    for row in rows:
        col_count = sum(
            span_of(cell, "colspan") for cell in row.find_all(["td", "th"])
        )
        num_cols = max(num_cols, col_count)

    # Occupancy grid used to find the next free column in each row.
    grid = [[None] * num_cols for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        cells = row.find_all(["td", "th"])

        # A row made only of <th> cells is a column-header row
        col_header = all(html_cell.name != "td" for html_cell in cells)

        col_idx = 0
        for html_cell in cells:
            try:
                text = self.extract_table_cell_text(html_cell)
            except Exception as exc:
                # Fall back to the raw cell text; never terminate the
                # process because of a single cell.
                _log.warning("exception: %s", exc)
                text = html_cell.text

            col_span = span_of(html_cell, "colspan")
            row_span = span_of(html_cell, "rowspan")

            # Skip slots already taken by cells spanning from earlier
            # rows/columns, without running past the grid.
            while col_idx < num_cols and grid[row_idx][col_idx] is not None:
                col_idx += 1

            # Mark the covered slots, clamped to the grid bounds.
            for r in range(row_idx, min(row_idx + row_span, num_rows)):
                for c in range(col_idx, min(col_idx + col_span, num_cols)):
                    grid[r][c] = text

            cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(cell)

    doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(self, list_element, level=0):
    """Recursively extract text from <ul> or <ol> with proper indentation.

    Returns a list of lines: ordered lists are numbered from 1, unordered
    lists use "*", and each nesting level adds one space of indentation.
    Any other element yields an empty list.
    """
    lines = []
    kind = list_element.name
    if kind not in ("ol", "ul"):
        return lines

    indent = " " * level
    entries = list_element.find_all("li", recursive=False)
    for position, entry in enumerate(entries, 1):
        # Numbers for ordered lists, a bullet character otherwise
        prefix = f"{position}." if kind == "ol" else "*"
        lines.append(f"{indent}{prefix} {entry.get_text(strip=True)}")

        # Handle nested lists
        sub_list = entry.find(["ul", "ol"])
        if sub_list:
            lines.extend(self.get_list_text(sub_list, level + 1))
    return lines
def extract_table_cell_text(self, cell):
    """Extract text from a table cell.

    The cell's plain text is returned in every case; cells containing lists
    only trigger a debug message, list-aware extraction is not implemented.
    """
    if cell.find(["ul", "ol"]) is not None:
        _log.debug(
            "should extract the content correctly for table-cells with lists ..."
        )
    return cell.text
def handle_figure(self, element, idx, doc):
    """Handles figure tags (figure).

    Adds a picture under the parent of the current level. When the figure
    has a <figcaption>, its concatenated, stripped text is added as a
    caption item and attached to the picture.
    """
    # Extract the image URI from the <img> tag
    # image_uri = root.xpath('//figure//img/@src')[0]

    caption_tag = element.find(["figcaption"])
    if caption_tag is None:
        doc.add_picture(parent=self.parents[self.level], caption=None)
        return

    caption_text = "".join([piece.text for piece in caption_tag]).strip()
    fig_caption = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
    doc.add_picture(
        parent=self.parents[self.level],
        caption=fig_caption,
    )
def handle_image(self, element, idx, doc):
    """Handles image tags (img) by adding a caption-less picture."""
    current_parent = self.parents[self.level]
    doc.add_picture(parent=current_parent, caption=None)

View File

@@ -0,0 +1,375 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
ProvenanceItem,
Size,
TableCell,
TableData,
)
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
from docling.backend.abstract_backend import (
DeclarativeDocumentBackend,
PaginatedDocumentBackend,
)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Load the presentation from a byte stream or a file path.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: The PPTX content or its location.

    Raises:
        RuntimeError: If the presentation cannot be loaded.
    """
    super().__init__(in_doc, path_or_stream)
    # XML namespaces used for element lookups in the presentation markup.
    self.namespaces = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    }
    # Powerpoint file:
    self.path_or_stream = path_or_stream

    self.pptx_obj = None
    self.valid = False
    try:
        source = self.path_or_stream
        if isinstance(source, BytesIO):
            self.pptx_obj = Presentation(source)
        elif isinstance(source, Path):
            self.pptx_obj = Presentation(str(source))

        self.valid = True
    except Exception as e:
        raise RuntimeError(
            f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
def page_count(self) -> int:
    """Return the number of slides, or 0 when the backend is not valid."""
    if not self.is_valid():
        return 0
    assert self.pptx_obj is not None
    return len(self.pptx_obj.slides)
def is_valid(self) -> bool:
    """Return the validity flag set while loading the presentation."""
    loaded_ok = self.valid
    return loaded_ok
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend exposes pages."""
    # True? if so, how to handle pages...
    paginated = True
    return paginated
def unload(self):
    """Release the input: close an in-memory stream and forget the reference."""
    held = self.path_or_stream
    if isinstance(held, BytesIO):
        held.close()
    self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles (PPTX only)."""
    formats = {InputFormat.PPTX}
    return formats
def convert(self) -> DoclingDocument:
    """Parse the PPTX into a structured document model.

    The document is named after the input file's stem, or "stream" when the
    input is not a path.

    Returns:
        The DoclingDocument built from the slides.

    Raises:
        RuntimeError: If the backend failed to initialise.
    """
    # Fail with a clear error instead of an AttributeError further down.
    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )
    assert self.pptx_obj is not None

    fname = ""
    if isinstance(self.path_or_stream, Path):
        fname = self.path_or_stream.name

    # NOTE(review): this is the legacy .ppt MIME type; confirm whether the
    # presentationml type is accepted by DocumentOrigin before changing it.
    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.ms-powerpoint",
        binary_hash=self.document_hash,
    )
    docname = Path(fname).stem if len(fname) > 0 else "stream"

    doc = DoclingDocument(
        name=docname, origin=origin
    )  # must add origin information
    return self.walk_linear(self.pptx_obj, doc)
def generate_prov(self, shape, slide_ind, text=""):
    """Build the provenance item for a shape on a slide.

    Args:
        shape: Shape providing ``left``, ``top``, ``width`` and ``height``.
        slide_ind: Zero-based slide index; the page number is one more.
        text: Text whose length defines the character span.

    Returns:
        A ProvenanceItem carrying the page number, the character span
        ``[0, len(text)]`` and the shape's bounding box.
    """
    # A shape without explicit geometry reports None; treat that as 0 so
    # the bounding-box arithmetic below cannot fail.
    left = shape.left or 0
    top = shape.top or 0
    width = shape.width or 0
    height = shape.height or 0
    shape_bbox = [left, top, left + width, top + height]
    shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
    prov = ProvenanceItem(
        page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
    )

    return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
    """Add the paragraphs of a text shape to the document.

    Paragraphs carrying a bullet or auto-number marker become list items
    inside a new list group; other paragraphs become text items labelled as
    title, section header (subtitle placeholders) or paragraph.

    Args:
        shape: Shape with a text frame.
        parent_slide: Group item representing the slide.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document receiving the items.
    """
    ns = {"a": self.namespaces["a"]}
    enum_list_item_value = 0
    for paragraph in shape.text_frame.paragraphs:
        enum_list_item_value += 1
        # Check if paragraph is a bullet point using the `element` XML
        p = paragraph._element
        has_bullet = p.find(".//a:buChar", namespaces=ns) is not None
        is_numbered = (
            not has_bullet
            and p.find(".//a:buAutoNum", namespaces=ns) is not None
        )
        is_list_item = has_bullet or is_numbered
        # An indented paragraph is most likely part of a sub-list
        is_a_list = is_list_item or paragraph.level > 0

        list_text = paragraph.text.strip()
        prov = self.generate_prov(shape, slide_ind, shape.text.strip())

        new_list = None
        if is_a_list:
            # Ordered list group for numbered paragraphs, plain list otherwise
            list_label = GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
            new_list = doc.add_group(
                label=list_label, name="list", parent=parent_slide
            )
            _log.debug("LIST DETECTED!")
        else:
            _log.debug("No List")

        # NOTE(review): the full paragraph text is added once per non-empty
        # run, so multi-run paragraphs are emitted several times - confirm
        # whether one item per paragraph is intended.
        for e in p.iterfind(".//a:r", namespaces=ns):
            if len(e.text.strip()) == 0:
                continue

            if is_list_item:
                # Set marker and enumerated arguments if this is an enumeration element.
                enum_marker = str(enum_list_item_value) + "."
                doc.add_list_item(
                    marker=enum_marker,
                    enumerated=is_numbered,
                    parent=new_list,
                    text=list_text,
                    prov=prov,
                )
            else:
                # Title and subtitle placeholders get their own labels;
                # every other text is a PARAGRAPH
                doc_label = DocItemLabel.PARAGRAPH
                if shape.is_placeholder:
                    placeholder_type = shape.placeholder_format.type
                    if placeholder_type in [
                        PP_PLACEHOLDER.CENTER_TITLE,
                        PP_PLACEHOLDER.TITLE,
                    ]:
                        # It's a title
                        doc_label = DocItemLabel.TITLE
                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                        # Previously a bare expression with no assignment,
                        # so subtitles stayed labelled as paragraphs.
                        doc_label = DocItemLabel.SECTION_HEADER

                # Non-list text restarts the list numbering
                enum_list_item_value = 0

                doc.add_text(
                    label=doc_label,
                    parent=parent_slide,
                    text=list_text,
                    prov=prov,
                )
    return
def handle_title(self, shape, parent_slide, slide_ind, doc):
    """Add a title or subtitle placeholder's text to the document.

    Title placeholders are added with the TITLE label and subtitle
    placeholders with the SECTION_HEADER label; empty text is ignored.
    """
    placeholder_type = shape.placeholder_format.type
    txt = shape.text.strip()
    prov = self.generate_prov(shape, slide_ind, txt)

    if not txt.strip():
        return

    if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
        _log.info(f"Title found: {shape.text}")
        doc.add_text(
            label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
        )
    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
        _log.info(f"Subtitle found: {shape.text}")
        # No dedicated subtitle label is used; subtitles are section headers.
        doc.add_text(
            label=DocItemLabel.SECTION_HEADER,
            parent=parent_slide,
            text=txt,
            prov=prov,
        )
    return
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
    """Add a caption-less picture item for a picture shape under the slide."""
    # shape has picture
    picture_prov = self.generate_prov(shape, slide_ind, "")
    doc.add_picture(parent=parent_slide, caption=None, prov=picture_prov)
    return
def handle_tables(self, shape, parent_slide, slide_ind, doc):
    """Add the table held by a shape to the document, under its slide.

    Row and column spans are read from the cell XML (``rowSpan`` /
    ``gridSpan``). Cells with empty text are left out, and a table without
    any non-empty cell is not added at all.

    Args:
        shape: Shape that may hold a table.
        parent_slide: Group item representing the slide.
        slide_ind: Zero-based slide index, used for provenance.
        doc: Document receiving the table.
    """
    if not shape.has_table:
        return

    table = shape.table
    # Access the XML element for the shape that contains the table
    table_xml = shape._element

    prov = self.generate_prov(shape, slide_ind, "")

    num_cols = 0
    num_rows = len(table.rows)
    tcells = []
    for row_idx, row in enumerate(table.rows):
        num_cols = max(num_cols, len(row.cells))
        for col_idx, cell in enumerate(row.cells):
            # Access the XML of the cell (this is the 'tc' element in table XML)
            cell_xml = table_xml.xpath(
                f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
            )
            if not cell_xml:
                continue  # If no cell XML is found, skip

            cell_xml = cell_xml[0]  # Get the first matching XML node
            row_span = cell_xml.get("rowSpan")  # Vertical span
            col_span = cell_xml.get("gridSpan")  # Horizontal span
            row_span = 1 if row_span is None else int(row_span)
            col_span = 1 if col_span is None else int(col_span)

            cell_text = cell.text.strip()
            if len(cell_text) == 0:
                continue

            tcells.append(
                TableCell(
                    text=cell_text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
                    row_header=False,
                )
            )

    if len(tcells) > 0:
        # If table is not fully empty...
        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
        data.table_cells.extend(tcells)
        # Attach the table to its slide, like every other slide element.
        doc.add_table(parent=parent_slide, data=data, prov=prov)
    return
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
    """Walk all slides in order and add their content to ``doc``.

    Every slide becomes a chapter group plus a page; tables, pictures and
    text shapes of the slide are delegated to the dedicated handlers.

    Args:
        pptx_obj: Loaded presentation.
        doc: Document being populated.

    Returns:
        The ``doc`` argument.
    """
    # Units of size in PPTX by default are EMU units (English Metric Units)
    slide_width = pptx_obj.slide_width
    slide_height = pptx_obj.slide_height

    # Loop through each slide; the enumerate index is the slide index, so no
    # per-slide lookup in the slide collection is needed.
    for slide_ind, slide in enumerate(pptx_obj.slides):
        parent_slide = doc.add_group(
            name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
        )

        size = Size(width=slide_width, height=slide_height)
        doc.add_page(page_no=slide_ind + 1, size=size)

        # Loop through each shape in the slide
        for shape in slide.shapes:

            if shape.has_table:
                # Handle Tables
                self.handle_tables(shape, parent_slide, slide_ind, doc)

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                # Handle Pictures
                self.handle_pictures(shape, parent_slide, slide_ind, doc)

            # If shape doesn't have any text, move on to the next shape
            if not hasattr(shape, "text"):
                continue
            if shape.text is None:
                continue
            if len(shape.text.strip()) == 0:
                continue
            if not shape.has_text_frame:
                _log.warning("Warning: shape has text but not text_frame")
                continue

            # Handle text elements, including titles and lists
            # (bullet lists, numbered lists)
            self.handle_text_elements(shape, parent_slide, slide_ind, doc)

    return doc

View File

@@ -0,0 +1,509 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
import docx
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from lxml import etree
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Load the Word document from a byte stream or a file path.

    Args:
        in_doc: Input document descriptor, forwarded to the base class.
        path_or_stream: The DOCX content or its location.

    Raises:
        RuntimeError: If the document cannot be loaded.
    """
    super().__init__(in_doc, path_or_stream)
    # Qualified name of the "val" attribute read from numbering elements.
    self.XML_KEY = (
        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
    )
    # NOTE(review): this is the Word 2003 WordML namespace, whereas XML_KEY
    # above uses the 2006 wordprocessingml one - confirm that lookups made
    # with this mapping match real DOCX content.
    self.xml_namespaces = {
        "w": "http://schemas.microsoft.com/office/word/2003/wordml"
    }
    # self.initialise(path_or_stream)
    # Word file:
    self.path_or_stream = path_or_stream
    self.valid = False
    # Initialise the parents for the hierarchy
    self.max_levels = 10
    self.level_at_new_list = None
    self.parents = {}  # type: ignore
    for i in range(-1, self.max_levels):
        self.parents[i] = None

    self.level = 0
    self.listIter = 0

    # Attributes of the previously handled elements; each list starts with
    # a None entry so the prev_* accessors always have a last element.
    self.history = {
        "names": [None],
        "levels": [None],
        "numids": [None],
        "indents": [None],
    }

    self.docx_obj = None
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.docx_obj = docx.Document(self.path_or_stream)
        elif isinstance(self.path_or_stream, Path):
            self.docx_obj = docx.Document(str(self.path_or_stream))

        self.valid = True
    except Exception as e:
        # The message previously named the PowerPoint backend by mistake.
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
def is_valid(self) -> bool:
    """Return the validity flag set while loading the Word document."""
    loaded_ok = self.valid
    return loaded_ok
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not expose pages."""
    paginated = False
    return paginated
def unload(self):
    """Close the input if it is an in-memory stream and clear the reference."""
    current = self.path_or_stream
    if isinstance(current, BytesIO):
        current.close()
    self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats this backend handles (DOCX only)."""
    formats = {InputFormat.DOCX}
    return formats
def convert(self) -> DoclingDocument:
    """Parse the DOCX into a structured document model.

    The document is named after the input file's stem, or "stream" when the
    input is not a path.

    Raises:
        RuntimeError: If the backend failed to initialise.
    """
    is_path = isinstance(self.path_or_stream, Path)
    fname = self.path_or_stream.name if is_path else ""

    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        binary_hash=self.document_hash,
    )
    docname = Path(fname).stem if len(fname) > 0 else "stream"
    doc = DoclingDocument(name=docname, origin=origin)

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    assert self.docx_obj is not None
    doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
    return doc
def update_history(self, name, level, numid, ilevel):
    """Append one element's name, level, numid and indent to the history lists."""
    entries = (
        ("names", name),
        ("levels", level),
        ("numids", numid),
        ("indents", ilevel),
    )
    for key, value in entries:
        self.history[key].append(value)
def prev_name(self):
    """Return the most recently recorded name from the history."""
    recorded = self.history["names"]
    return recorded[-1]
def prev_level(self):
    """Return the most recently recorded level from the history."""
    recorded = self.history["levels"]
    return recorded[-1]
def prev_numid(self):
    """Return the most recently recorded numid from the history."""
    recorded = self.history["numids"]
    return recorded[-1]
def prev_indent(self):
    """Return the most recently recorded indent from the history."""
    recorded = self.history["indents"]
    return recorded[-1]
def get_level(self) -> int:
    """Return the first non-negative level whose parent slot is None.

    Returns:
        The first key ``k >= 0`` of ``self.parents`` (in insertion order)
        whose value is None, or 0 when every such slot is occupied.
    """
    # Identity test: an occupied slot must never count as free just because
    # its object compares equal to None.
    return next(
        (k for k, v in self.parents.items() if k >= 0 and v is None),
        0,
    )
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
    """Visit the body elements in order and dispatch each to its handler.

    Tables are tried first (a failing table is only logged), then elements
    containing drawings or pictures, then paragraphs; anything else is
    ignored with a debug message.
    """
    for element in body:
        tag_name = etree.QName(element).localname

        # Check for Inline Images (drawings or blip elements)
        found_drawing = etree.ElementBase.xpath(
            element, ".//w:drawing", namespaces=self.xml_namespaces
        )
        found_pict = etree.ElementBase.xpath(
            element, ".//w:pict", namespaces=self.xml_namespaces
        )

        # Check for Tables
        if element.tag.endswith("tbl"):
            try:
                self.handle_tables(element, docx_obj, doc)
            except Exception:
                _log.debug("could not parse a table, broken docx table")
            continue

        if found_drawing or found_pict:
            self.handle_pictures(element, docx_obj, doc)
            continue

        # Check for Text
        if tag_name == "p":
            self.handle_text_elements(element, docx_obj, doc)
            continue

        _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
    return doc
def str_to_int(self, s, default=0):
    """Convert ``s`` to an ``int``.

    Args:
        s: Value to convert; ``None`` is passed through unchanged.
        default: Value returned when ``int(s)`` raises ``ValueError``.

    Returns:
        ``None`` if ``s`` is ``None`` (``default`` is not used in that case),
        the parsed integer on success, otherwise ``default``.
    """
    if s is not None:
        try:
            parsed = int(s)
        except ValueError:
            parsed = default
        return parsed
    return None
def get_numId_and_ilvl(self, paragraph):
    """Read the list numbering id and indent level from a paragraph's XML.

    Args:
        paragraph: Object exposing the underlying XML element as ``_element``.

    Returns:
        A ``(numId, ilvl)`` tuple. Both entries are ``None`` when the
        paragraph has no ``w:numPr`` descendant; a single entry is ``None``
        when its child element is missing or its value is not an integer.
    """
    xml_element = paragraph._element
    num_pr = xml_element.find(".//w:numPr", namespaces=xml_element.nsmap)
    if num_pr is None:
        # The paragraph is not part of a list.
        return None, None

    def _raw_value(child_tag):
        # Attribute ``self.XML_KEY`` of the named child, or None if absent.
        child = num_pr.find(child_tag, namespaces=paragraph._element.nsmap)
        if child is None:
            return None
        return child.get(self.XML_KEY)

    raw_num_id = _raw_value("w:numId")
    raw_ilvl = _raw_value("w:ilvl")
    num_id = self.str_to_int(raw_num_id, default=None)
    indent = self.str_to_int(raw_ilvl, default=None)
    return num_id, indent
def get_label_and_level(self, paragraph):
    """Derive a ``(label, level)`` pair from the paragraph's style name.

    Rules, in order:

    * no style, or a style without a name -> ``("Normal", None)``;
    * ``"<label>:<int>"`` -> ``(<label>, <int>)``;
    * a two-word name containing ``"Heading"`` (e.g. ``"Heading 2"``, in
      either word order) -> ``("Heading", <int or None>)``;
    * anything else -> ``(<style name>, None)``.

    Args:
        paragraph: Object exposing ``style`` (or ``None``) with a ``name``.

    Returns:
        Tuple of the label string and the level (``int`` or ``None``).
    """
    if paragraph.style is None:
        return "Normal", None
    label = paragraph.style.name
    if label is None:
        return "Normal", None

    if ":" in label:
        parts = label.split(":")
        if len(parts) == 2:
            try:
                return parts[0], int(parts[1])
            except ValueError:
                # A user-defined style such as "Note: Important" has a colon
                # but no numeric suffix. Treat it like any other style name
                # instead of aborting the conversion.
                pass

    parts = label.split(" ")
    if "Heading" in label and len(parts) == 2:
        # Sorting puts the numeric word first, so "Heading 2" and "2 Heading"
        # are handled by the same two checks below.
        parts.sort()
        label_str = ""
        label_level = 0
        if parts[0] == "Heading":
            label_str = parts[0]
            label_level = self.str_to_int(parts[1], default=None)
        if parts[1] == "Heading":
            label_str = parts[1]
            label_level = self.str_to_int(parts[0], default=None)
        return label_str, label_level

    return label, None
def handle_text_elements(self, element, docx_obj, doc):
    """Add one paragraph element to ``doc`` as a list item, title, heading or text.

    The paragraph is a list item when it carries both a numbering id and an
    indent level. Otherwise any list left open by the previous paragraph is
    closed, and the style name decides between title, heading and plain
    paragraph. The paragraph's attributes are recorded in the history in
    every case.

    Args:
        element: XML element of the paragraph.
        docx_obj: The loaded DOCX object, used to wrap ``element``.
        doc: The document being built.
    """
    paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
    raw_text = paragraph.text
    if raw_text is None:
        return
    # Empty paragraphs are kept on purpose: they separate adjacent lists.
    text = raw_text.strip()

    # Common styles for bullet and numbered lists:
    # "List Bullet", "List Number", "List Paragraph".
    # TODO: reliably identify whether the list is a numbered list or not.
    is_numbered = False

    p_style_name, p_level = self.get_label_and_level(paragraph)
    numid, ilevel = self.get_numId_and_ilvl(paragraph)
    if numid == 0:
        # A numbering id of 0 is handled exactly like "no numbering".
        numid = None

    # Handle lists
    if numid is not None and ilevel is not None:
        self.add_listitem(
            element,
            docx_obj,
            doc,
            p_style_name,
            p_level,
            numid,
            ilevel,
            text,
            is_numbered,
        )
        self.update_history(p_style_name, p_level, numid, ilevel)
        return

    if numid is None and self.prev_numid() is not None:
        # Close list: forget every parent at or below the list's root level.
        list_root = self.level_at_new_list
        for depth in self.parents:
            if depth >= list_root:
                self.parents[depth] = None
        self.level = list_root - 1
        self.level_at_new_list = None

    if p_style_name == "Title":
        # A title resets the whole parent chain and becomes level 0.
        for depth in self.parents:
            self.parents[depth] = None
        self.parents[0] = doc.add_text(
            parent=None, label=DocItemLabel.TITLE, text=text
        )
    elif "Heading" in p_style_name:
        self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
    else:
        # Well-known body styles ("Paragraph", "Normal", "Subtitle", "Author",
        # "Default Text", "List Paragraph", "List Bullet", "Quote") and any
        # user-defined style are all emitted as plain paragraphs.
        level = self.get_level()
        doc.add_text(
            label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
        )

    self.update_history(p_style_name, p_level, numid, ilevel)
    return
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
    """Add a heading to ``doc`` and register it in ``self.parents``.

    With an integer ``curr_level`` the heading is stored at that level: section
    groups are inserted for any levels skipped on the way down, and deeper
    levels are cleared when moving back up. Without an integer level the
    heading is stored at ``self.level``.

    Args:
        element: XML element of the paragraph (unused here).
        docx_obj: The loaded DOCX object (unused here).
        doc: The document being built.
        curr_name: Style label of the heading (unused here).
        curr_level: Heading level, or a non-int when it is unknown.
        text: Heading text.
    """
    level = self.get_level()

    if not isinstance(curr_level, int):
        # No usable level on the style: fall back to the tracked level.
        self.parents[self.level] = doc.add_heading(
            parent=self.parents[self.level - 1], text=text
        )
        return

    if curr_level > level:
        # add invisible groups for the levels that were skipped
        for depth in range(level, curr_level):
            self.parents[depth] = doc.add_group(
                parent=self.parents[depth - 1],
                label=GroupLabel.SECTION,
                name=f"header-{depth}",
            )
    elif curr_level < level:
        # remove the tail
        for depth in self.parents:
            if depth >= curr_level:
                self.parents[depth] = None

    # In all three cases the heading is stored at its own level.
    self.parents[curr_level] = doc.add_heading(
        parent=self.parents[curr_level - 1], text=text
    )
    return
def add_listitem(
    self,
    element,
    docx_obj,
    doc,
    p_style_name,
    p_level,
    numid,
    ilevel,
    text: str,
    is_numbered=False,
):
    """Add one list item to ``doc``, opening or closing list groups as needed.

    The previous paragraph's numbering id and indent (from the history)
    select one of four cases: open a new list, open nested list(s), step back
    to a shallower list, or continue the current list. If none of the four
    conditions holds, nothing is added.

    Args:
        element: XML element of the paragraph (unused here).
        docx_obj: The loaded DOCX object (unused here).
        doc: The document being built.
        p_style_name: Style label of the paragraph (unused here).
        p_level: Style level of the paragraph (unused here).
        numid: Numbering id of this list item.
        ilevel: Indent level of this list item.
        text: Item text.
        is_numbered: Whether the item is emitted as enumerated with an
            ``"<n>."`` marker.
    """

    def _emit(parent_key):
        # Bump the running counter, derive the marker, append the item.
        # TODO: Set marker and enumerated arguments if this is an enumeration element.
        self.listIter += 1
        marker = ""
        enumerated = is_numbered
        if is_numbered:
            marker = str(self.listIter) + "."
            enumerated = True
        doc.add_list_item(
            marker=marker,
            enumerated=enumerated,
            parent=self.parents[parent_key],
            text=text,
        )

    level = self.get_level()
    last_numid = self.prev_numid()

    if last_numid is None:  # Open new list
        self.level_at_new_list = level  # type: ignore
        self.parents[level] = doc.add_group(
            label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
        )
        _emit(level)
        return

    same_list = last_numid == numid
    last_indent = self.prev_indent()

    if same_list and last_indent < ilevel:  # Open indented list
        first_new = self.level_at_new_list + self.prev_indent() + 1
        last_new = self.level_at_new_list + ilevel
        for depth in range(first_new, last_new + 1):
            # TODO: determine if this is an unordered list or an ordered list.
            # Set GroupLabel.ORDERED_LIST when it fits.
            self.listIter = 0
            group_label = GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
            self.parents[depth] = doc.add_group(
                label=group_label,
                name="list",
                parent=self.parents[depth - 1],
            )
        _emit(self.level_at_new_list + ilevel)
    elif same_list and ilevel < last_indent:  # Close list
        for depth in self.parents:
            if depth > self.level_at_new_list + ilevel:
                self.parents[depth] = None
        _emit(self.level_at_new_list + ilevel)
        self.listIter = 0
    elif same_list or last_indent == ilevel:
        # Continue the list that is currently open.
        _emit(level - 1)
    return
def handle_tables(self, element, docx_obj, doc):
    """Convert one table element into a table item appended to ``doc``.

    Every cell of every row becomes a ``TableCell`` placed at the next free
    column of its row. The table is attached under the parent one level above
    the first free level reported by ``get_level()``.

    Args:
        element: XML element of the table.
        docx_obj: The loaded DOCX object, used to wrap ``element``.
        doc: The document being built.
    """

    def get_colspan(cell):
        """Return the column span found on ``cell``, or 1 when there is none."""
        # NOTE(review): this queries an attribute on the cell element itself;
        # confirm that it matches where real documents store gridSpan.
        grid_span = cell._element.xpath("@w:gridSpan")
        return int(grid_span[0]) if grid_span else 1

    table = docx.table.Table(element, docx_obj)
    num_rows = len(table.rows)
    # Widest row, measured in grid columns.
    num_cols = max(
        (sum(get_colspan(cell) for cell in row.cells) for row in table.rows),
        default=0,
    )

    # Occupancy grid: None = free slot, "" = taken by a cell.
    table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    for row_idx, row in enumerate(table.rows):
        col_idx = 0
        for cell in row.cells:
            # A vertical-merge marker ("restart" / continuation) only says
            # that a merge exists, not how many rows it covers. The raw marker
            # string used to reach ``range()`` and the row-offset arithmetic,
            # raising TypeError. Each cell is therefore one row high.
            row_span = 1
            col_span = get_colspan(cell)

            # Find the next available column in the grid
            while table_grid[row_idx][col_idx] is not None:
                col_idx += 1

            # Mark the slots covered by this cell as taken
            for i in range(row_span):
                for j in range(col_span):
                    table_grid[row_idx + i][col_idx + j] = ""

            table_cell = TableCell(
                text=cell.text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=False,  # header detection is not implemented
                row_header=False,
            )
            data.table_cells.append(table_cell)

    level = self.get_level()
    doc.add_table(data=data, parent=self.parents[level - 1])
    return
def handle_pictures(self, element, docx_obj, doc):
    """Add a picture placeholder (no caption) to ``doc``.

    Args:
        element: XML element containing the drawing/picture (unused here).
        docx_obj: The loaded DOCX object (unused here).
        doc: The document being built.
    """
    # Attach under the deepest open parent, the same way tables and paragraphs
    # do. ``self.level`` is only refreshed when a list is closed, so using it
    # here could attach the picture to an outdated parent.
    level = self.get_level()
    doc.add_picture(parent=self.parents[level - 1], caption=None)
    return

View File

@@ -0,0 +1,78 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
from docling.datamodel.document import InputDocument
class PdfPageBackend(ABC):
    """Abstract interface of a single PDF page backend.

    Every method is abstract; the descriptions below state the intent
    suggested by the signatures, and concrete backends define the details.
    """

    @abstractmethod
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text belonging to the rectangle ``bbox``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable[Cell]:
        """Return the text cells of the page."""

    @abstractmethod
    def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
        """Return the bounding boxes of the page's bitmap regions.

        NOTE(review): the parameter is literally named ``float`` (shadowing
        the builtin) and annotated ``int``; it looks like a scale factor.
        Confirm against the implementations. The name is kept so keyword
        callers and overrides keep working.
        """

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return an image of the page, given ``scale`` and an optional ``cropbox``."""

    @abstractmethod
    def get_size(self) -> Size:
        """Return the size of the page."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the page backend is usable."""

    @abstractmethod
    def unload(self):
        """Release whatever the page backend holds."""
class PdfDocumentBackend(PaginatedDocumentBackend):
    """Abstract base for paginated backends that read PDF data.

    Image inputs are accepted too: they are converted to an in-memory PDF at
    construction time, so subclasses always find PDF data in
    ``self.path_or_stream``.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Initialise the backend, converting an image input to PDF.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Location or in-memory content of the input.

        Raises:
            RuntimeError: If the input format is neither PDF nor IMAGE.
        """
        super().__init__(in_doc, path_or_stream)

        if self.input_format is not InputFormat.PDF:
            if self.input_format is InputFormat.IMAGE:
                buf = BytesIO()
                # The context manager releases the file handle that PIL opens
                # for path inputs; it was previously left open.
                with Image.open(self.path_or_stream) as img:
                    img.save(buf, "PDF")
                buf.seek(0)
                self.path_or_stream = buf
            else:
                raise RuntimeError(
                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
                )

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return the page backend for page ``page_no``."""
        pass

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""
        pass

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats this backend declares support for."""
        return {InputFormat.PDF}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``True``: this backend is paginated."""
        return True

View File

@@ -2,16 +2,20 @@ import logging
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
f"pypdfium could not load document with hash {self.document_hash}"
) from e
def page_count(self) -> int: