feat!: Docling v2 (#117)

--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2024-10-16 21:02:03 +02:00
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions
@@ -1,68 +1,63 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Set, Union

-from PIL import Image
+from docling_core.types.doc import DoclingDocument

 if TYPE_CHECKING:
-    from docling.datamodel.base_models import BoundingBox, Cell, PageSize
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.document import InputDocument


-class PdfPageBackend(ABC):
-
+class AbstractDocumentBackend(ABC):
    @abstractmethod
-    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-        pass
-
-    @abstractmethod
-    def get_text_cells(self) -> Iterable["Cell"]:
-        pass
-
-    @abstractmethod
-    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-        pass
-
-    @abstractmethod
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-    ) -> Image.Image:
-        pass
-
-    @abstractmethod
-    def get_size(self) -> "PageSize":
-        pass
-
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-
-    @abstractmethod
-    def unload(self):
-        pass
-
-
-class PdfDocumentBackend(ABC):
-    @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        self.path_or_stream = path_or_stream
-        self.document_hash = document_hash
-
-    @abstractmethod
-    def load_page(self, page_no: int) -> PdfPageBackend:
-        pass
-
-    @abstractmethod
-    def page_count(self) -> int:
-        pass
+        self.document_hash = in_doc.document_hash
+        self.input_format = in_doc.format

    @abstractmethod
    def is_valid(self) -> bool:
        pass

+    @classmethod
+    @abstractmethod
+    def supports_pagination(cls) -> bool:
+        pass
+
    @abstractmethod
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None
+
+    @classmethod
+    @abstractmethod
+    def supported_formats(cls) -> Set["InputFormat"]:
+        pass
+
+
+class PaginatedDocumentBackend(AbstractDocumentBackend):
+    """PaginatedDocumentBackend.
+
+    A paginated document backend is a backend whose documents are divided into
+    pages and which can report how many pages a document has.
+    """
+
+    @abstractmethod
+    def page_count(self) -> int:
+        """Return the number of pages of the document."""
+        pass
+
+
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
+
+    A declarative document backend is a backend that can be converted directly
+    to a DoclingDocument, without a recognition pipeline.
+    """
+
+    @abstractmethod
+    def convert(self) -> DoclingDocument:
+        """Return the input converted to a DoclingDocument."""
+        pass
@@ -5,12 +5,14 @@ from pathlib import Path
 from typing import Iterable, List, Optional, Union

 import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
 from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell
+from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):


 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser()

        success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
            )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
@@ -2,15 +2,19 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union

 import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_parse.docling_parse import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell, Size
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):


 class DoclingParseV2DocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        success = False
        if isinstance(path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
-                document_hash, path_or_stream
+                self.document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(document_hash, str(path_or_stream))
+            success = self.parser.load_document(self.document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
-                f"docling-parse could not load document {document_hash}."
+                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
@@ -0,0 +1,425 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from bs4 import BeautifulSoup
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Parse the HTML input into a BeautifulSoup tree and reset the walk state.
+
+        Args:
+            in_doc: Input document descriptor, forwarded to the base class.
+            path_or_stream: In-memory stream or filesystem path of the HTML input.
+
+        Raises:
+            RuntimeError: If reading, decoding or parsing the input fails.
+        """
+        super().__init__(in_doc, path_or_stream)
+        _log.debug("About to init HTML backend...")
+        self.soup = None
+        # HTML file:
+        self.path_or_stream = path_or_stream
+        # Hierarchy bookkeeping: one (initially empty) parent slot per nesting level
+        self.max_levels = 10
+        self.level = 0
+        self.parents = {depth: None for depth in range(self.max_levels)}  # type: ignore
+        # Number of times each tag name has been analysed
+        self.labels = {}  # type: ignore
+
+        source = self.path_or_stream
+        try:
+            if isinstance(source, BytesIO):
+                markup = source.getvalue().decode("utf-8")
+                self.soup = BeautifulSoup(markup, "html.parser")
+            elif isinstance(source, Path):
+                with open(source, "r", encoding="utf-8") as handle:
+                    markup = handle.read()
+                    self.soup = BeautifulSoup(markup, "html.parser")
+        except Exception as e:
+            raise RuntimeError(
+                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return True when a parsed HTML tree is available (``self.soup`` is set)."""
+        return self.soup is not None
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not paginate its input (always ``False``)."""
+        return False
+
+    def unload(self):
+        """Release the input: close it if it is an in-memory stream, then drop the reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend (HTML only)."""
+        return {InputFormat.HTML}
+
+    def convert(self) -> DoclingDocument:
+        """Convert the parsed HTML tree into a DoclingDocument.
+
+        Returns:
+            The document populated by walking the HTML tree.
+
+        Raises:
+            RuntimeError: If the backend holds no parsed HTML tree.
+        """
+        _log.debug("Trying to convert HTML...")
+
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+        assert self.soup is not None
+
+        # "html.parser" does not synthesise a <body> for HTML fragments, so fall
+        # back to the tree root instead of dereferencing None.
+        root = self.soup.body if self.soup.body is not None else self.soup
+
+        doc = DoclingDocument(name="dummy")
+        # Replace <br> tags with newline characters
+        for br in root.find_all("br"):
+            br.replace_with("\n")
+        return self.walk(root, doc)
+
+    def walk(self, element, doc):
+        """Dispatch every direct child of ``element`` to ``analyse_element``.
+
+        Conversion is best effort: when a child cannot be processed the error
+        is logged and the remaining children of ``element`` are skipped.
+
+        Args:
+            element: Node whose ``children`` are analysed.
+            doc: Document being populated.
+
+        Returns:
+            The same ``doc`` object.
+        """
+        # Nodes without a ``children`` attribute (e.g. plain strings reaching this
+        # method through the fallback branch of analyse_element) have nothing to walk.
+        children = getattr(element, "children", None)
+        if children is None:
+            return doc
+
+        # Iterate over the direct children of the element
+        for idx, child in enumerate(children):
+            try:
+                self.analyse_element(child, idx, doc)
+            except Exception as exc_child:
+                # Use %-style placeholders: extra positional arguments without
+                # them make the logging call itself fail.
+                _log.exception(" -> error treating child: %s", exc_child)
+                _log.error(" => element: %s\n", child)
+                break
+
+        return doc
+
+    def analyse_element(self, element, idx, doc):
+        """Count the element's tag name and route the element to its handler.
+
+        Tags without a dedicated handler are walked, so that their children
+        still get analysed.
+        """
+        tag = element.name
+        self.labels[tag] = self.labels.get(tag, 0) + 1
+
+        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
+            handler = self.handle_header
+        elif tag == "p":
+            handler = self.handle_paragraph
+        elif tag in ("ul", "ol"):
+            handler = self.handle_list
+        elif tag == "li":
+            handler = self.handle_listitem
+        elif tag == "table":
+            handler = self.handle_table
+        elif tag == "figure":
+            handler = self.handle_figure
+        elif tag == "img":
+            handler = self.handle_image
+        else:
+            self.walk(element, doc)
+            return
+
+        handler(element, idx, doc)
+
+    def get_direct_text(self, item):
+        """Return the stripped first direct string child of ``item``.
+
+        Only direct children are searched, so text inside nested elements (such
+        as nested lists) is ignored. Returns "" when there is no direct string.
+        """
+        direct = item.find(string=True, recursive=False)
+        return direct.strip() if isinstance(direct, str) else ""
+
+    # Function to recursively extract text from all child nodes
+    def extract_text_recursively(self, item):
+        """Collect the text of ``item`` and of all of its child nodes.
+
+        Args:
+            item: A string or a node that offers ``find`` and iteration over
+                its children.
+
+        Returns:
+            ``[item]`` when ``item`` is itself a string (kept for backward
+            compatibility); otherwise the collected text fragments joined by
+            single spaces.
+        """
+        if isinstance(item, str):
+            return [item]
+
+        result = [self.get_direct_text(item)]
+
+        try:
+            children = iter(item)
+        except TypeError:
+            _log.warning("item has no children")
+            return " ".join(result)
+
+        for child in children:
+            try:
+                # Recursively get the child's text content
+                child_text = self.extract_text_recursively(child)
+            except Exception:
+                # Best effort: skip children whose text cannot be extracted
+                continue
+            if isinstance(child_text, str):
+                # A nested node yields one joined string; extend() would split
+                # it into single characters.
+                result.append(child_text)
+            else:
+                result.extend(child_text)
+
+        return " ".join(result)
+
+    def handle_header(self, element, idx, doc):
+        """Handles header tags (h1, h2, etc.).
+
+        An <h1> clears the whole parent hierarchy and is added as the title at
+        level 1. Any other header is added as a section header below the parent
+        one level up; skipped levels are filled with section groups, and
+        parents deeper than the new header are cleared.
+        """
+        hlevel = int(element.name.replace("h", ""))
+        text = element.text.strip()
+
+        if hlevel == 1:
+            for key in self.parents:
+                self.parents[key] = None
+
+            self.level = 1
+            self.parents[self.level] = doc.add_text(
+                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+            )
+            return
+
+        if hlevel > self.level:
+            # add invisible groups for the levels that were skipped
+            for i in range(self.level + 1, hlevel):
+                self.parents[i] = doc.add_group(
+                    name=f"header-{i}",
+                    label=GroupLabel.SECTION,
+                    parent=self.parents[i - 1],
+                )
+        elif hlevel < self.level:
+            # remove the tail of the hierarchy below the new header
+            for key in self.parents:
+                if key > hlevel:
+                    self.parents[key] = None
+
+        self.parents[hlevel] = doc.add_text(
+            parent=self.parents[hlevel - 1],
+            label=DocItemLabel.SECTION_HEADER,
+            text=text,
+        )
+        self.level = hlevel
+
+    def handle_paragraph(self, element, idx, doc):
+        """Handles paragraph tags (p); paragraphs without any text are skipped."""
+        raw_text = element.text
+        if raw_text is None:
+            return
+
+        text = raw_text.strip()
+        if not text:
+            return
+
+        doc.add_text(
+            parent=self.parents[self.level],
+            label=DocItemLabel.PARAGRAPH,
+            text=text,
+        )
+
+    def handle_list(self, element, idx, doc):
+        """Handles list tags (ul, ol) and their list items.
+
+        A list group is created one level below the current one, the children
+        of the list are walked at that level, and the level is restored
+        afterwards.
+        """
+        tag = element.name
+        if tag in ("ul", "ol"):
+            ordered = tag == "ol"
+            # create a list group
+            self.parents[self.level + 1] = doc.add_group(
+                parent=self.parents[self.level],
+                name="ordered list" if ordered else "list",
+                label=GroupLabel.ORDERED_LIST if ordered else GroupLabel.LIST,
+            )
+        self.level += 1
+
+        self.walk(element, doc)
+
+        self.parents[self.level + 1] = None
+        self.level -= 1
+
+    def handle_listitem(self, element, idx, doc):
+        """Handles listitem tags (li).
+
+        An item that contains a nested list is added with its direct text only
+        and then acts as the parent while its children are walked. A plain item
+        is added with its full text. Items whose parent is an ordered list are
+        enumerated and carry their 1-based position as marker.
+        """
+        nested_lists = element.find(["ul", "ol"])
+
+        parent_list_label = self.parents[self.level].label
+        index_in_list = len(self.parents[self.level].children) + 1
+
+        if nested_lists:
+            text = self.get_direct_text(element)
+
+            marker = ""
+            enumerated = False
+            if parent_list_label == GroupLabel.ORDERED_LIST:
+                # NOTE(review): no trailing dot here, unlike the plain-item
+                # branch below -- confirm whether that difference is intended.
+                marker = str(index_in_list)
+                enumerated = True
+
+            # create a list-item
+            self.parents[self.level + 1] = doc.add_list_item(
+                text=text,
+                enumerated=enumerated,
+                marker=marker,
+                parent=self.parents[self.level],
+            )
+            self.level += 1
+
+            self.walk(element, doc)
+
+            self.parents[self.level + 1] = None
+            self.level -= 1
+
+        elif isinstance(element.text, str):
+            text = element.text.strip()
+
+            marker = ""
+            enumerated = False
+            if parent_list_label == GroupLabel.ORDERED_LIST:
+                marker = f"{str(index_in_list)}."
+                enumerated = True
+            doc.add_list_item(
+                text=text,
+                enumerated=enumerated,
+                marker=marker,
+                parent=self.parents[self.level],
+            )
+        else:
+            # %-style placeholder: an extra argument without one makes the
+            # logging call itself fail.
+            _log.warning("list-item has no text: %s", element)
+
+    def handle_table(self, element, idx, doc):
+        """Handles table tags.
+
+        Builds a TableData from the <tr>/<td>/<th> structure, honouring colspan
+        and rowspan, and adds it to the document below the current parent.
+        Tables that contain nested tables are skipped.
+        """
+        if element.find("table") is not None:
+            _log.warning("detected nested tables: skipping for now")
+            return
+
+        # Look the rows up once; their count is the number of table rows
+        rows = element.find_all("tr")
+        num_rows = len(rows)
+
+        # Find the number of columns (taking into account colspan)
+        num_cols = 0
+        for row in rows:
+            col_count = sum(
+                int(cell.get("colspan", 1)) for cell in row.find_all(["td", "th"])
+            )
+            num_cols = max(num_cols, col_count)
+
+        # Occupancy grid: a slot is non-None once a cell or one of its spans covers it
+        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        # Iterate over the rows in the table
+        for row_idx, row in enumerate(rows):
+
+            # For each row, find all the column cells (both <td> and <th>)
+            cells = row.find_all(["td", "th"])
+
+            # A row without any <td> cell is treated as a column-header row
+            col_header = all(html_cell.name != "td" for html_cell in cells)
+
+            col_idx = 0
+            for html_cell in cells:
+                try:
+                    text = self.extract_table_cell_text(html_cell)
+                except Exception as exc:
+                    # Fall back to the raw cell text; library code must not
+                    # terminate the interpreter.
+                    _log.warning("exception: %s", exc)
+                    text = html_cell.text
+
+                col_span = int(html_cell.get("colspan", 1))
+                row_span = int(html_cell.get("rowspan", 1))
+
+                # Skip the slots already covered by spans of earlier cells
+                while col_idx < num_cols and grid[row_idx][col_idx] is not None:
+                    col_idx += 1
+                # Mark the covered slots, clipped to the grid so that spans
+                # reaching past the table cannot raise IndexError
+                for r in range(row_idx, min(row_idx + row_span, num_rows)):
+                    for c in range(col_idx, min(col_idx + col_span, num_cols)):
+                        grid[r][c] = text
+
+                cell = TableCell(
+                    text=text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=col_header,
+                    row_header=((not col_header) and html_cell.name == "th"),
+                )
+                data.table_cells.append(cell)
+
+        doc.add_table(data=data, parent=self.parents[self.level])
+
+    def get_list_text(self, list_element, level=0):
+        """Recursively extract text from <ul> or <ol> with proper indentation.
+
+        Each direct <li> becomes one line, indented by four spaces per level
+        and prefixed with its 1-based number (ordered lists) or "*" (unordered
+        lists). The lines of a nested list follow its item. Elements that are
+        neither <ul> nor <ol> yield an empty list.
+        """
+        if list_element.name not in ("ol", "ul"):
+            return []
+
+        ordered = list_element.name == "ol"
+        indent = "    " * level
+        lines = []
+
+        for position, li in enumerate(list_element.find_all("li", recursive=False), 1):
+            prefix = f"{position}." if ordered else "*"
+            lines.append(f"{indent}{prefix} {li.get_text(strip=True)}")
+            # Handle nested lists
+            nested = li.find(["ul", "ol"])
+            if nested:
+                lines.extend(self.get_list_text(nested, level + 1))
+
+        return lines
+
+    def extract_table_cell_text(self, cell):
+        """Extract text from a table cell.
+
+        Cells that contain lists are currently returned as plain text as well;
+        only a debug message is logged for them.
+        """
+        if cell.find(["ul", "ol"]) is not None:
+            _log.debug(
+                "should extract the content correctly for table-cells with lists ..."
+            )
+        return cell.text
+
+    def handle_figure(self, element, idx, doc):
+        """Handles figure tags.
+
+        A picture is added below the current parent. When the figure has a
+        <figcaption>, the concatenated text of its children is added as a
+        caption text item and attached to the picture.
+        """
+        caption_tag = element.find(["figcaption"])
+        if caption_tag is None:
+            doc.add_picture(parent=self.parents[self.level], caption=None)
+            return
+
+        caption_text = "".join([item.text for item in caption_tag]).strip()
+        fig_caption = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
+        doc.add_picture(
+            parent=self.parents[self.level],
+            caption=fig_caption,
+        )
+
+    def handle_image(self, element, idx, doc):
+        """Handles image tags (img).
+
+        Adds a picture without caption below the current parent; the element
+        itself is not inspected.
+        """
+        doc.add_picture(parent=self.parents[self.level], caption=None)
@@ -0,0 +1,375 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the presentation with ``pptx.Presentation``.
+
+        Args:
+            in_doc: Input document descriptor, forwarded to the base class.
+            path_or_stream: In-memory stream or filesystem path of the PPTX input.
+
+        Raises:
+            RuntimeError: If the presentation cannot be loaded.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # XML namespaces used for lookups in the presentation XML
+        self.namespaces = {
+            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
+            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
+        }
+        # Powerpoint file:
+        self.path_or_stream = path_or_stream
+
+        self.pptx_obj = None
+        self.valid = False
+
+        source = self.path_or_stream
+        try:
+            if isinstance(source, BytesIO):
+                self.pptx_obj = Presentation(source)
+            elif isinstance(source, Path):
+                self.pptx_obj = Presentation(str(source))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+
+        self.valid = True
+
+    def page_count(self) -> int:
+        """Return the number of slides, or 0 when the backend is not valid."""
+        if not self.is_valid():
+            return 0
+
+        assert self.pptx_obj is not None
+        return len(self.pptx_obj.slides)
+
+    def is_valid(self) -> bool:
+        """Return the ``valid`` flag, which ``__init__`` sets once loading succeeded."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend paginates its input (always ``True``)."""
+        return True  # True? if so, how to handle pages...
+
+    def unload(self):
+        """Release the input: close it if it is an in-memory stream, then drop the reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend (PPTX only)."""
+        return {InputFormat.PPTX}
+
+    def convert(self) -> DoclingDocument:
+        """Parse the PPTX into a structured document model.
+
+        The document is named after the file stem for path inputs and "stream"
+        otherwise; its origin records the file name, mimetype and document hash.
+        """
+        source = self.path_or_stream
+        fname = source.name if isinstance(source, Path) else ""
+
+        # NOTE(review): this looks like the legacy .ppt media type rather than
+        # the OOXML presentation one -- confirm which value DocumentOrigin expects.
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="application/vnd.ms-powerpoint",
+            binary_hash=self.document_hash,
+        )
+        docname = Path(fname).stem if fname else "stream"
+
+        # must add origin information
+        doc = DoclingDocument(name=docname, origin=origin)
+        return self.walk_linear(self.pptx_obj, doc)
+
+    def generate_prov(self, shape, slide_ind, text=""):
+        """Build the provenance item of a shape.
+
+        Args:
+            shape: Shape providing ``left``, ``top``, ``width`` and ``height``.
+            slide_ind: 0-based slide index; stored as 1-based page number.
+            text: Text whose length defines the character span.
+
+        Returns:
+            A ProvenanceItem with the page number, the char span
+            ``[0, len(text)]`` and the bounding box of the shape.
+        """
+        x0, y0 = shape.left, shape.top
+        x1, y1 = x0 + shape.width, y0 + shape.height
+        # NOTE(review): the box is built from the shape's left/top offsets but
+        # tagged BOTTOMLEFT -- confirm the intended coordinate origin.
+        shape_bbox = BoundingBox.from_tuple(
+            [x0, y0, x1, y1], origin=CoordOrigin.BOTTOMLEFT
+        )
+        return ProvenanceItem(
+            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
+        )
+
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
+        """Add the paragraphs of a text shape as list items or text items.
+
+        A paragraph counts as a list item when its XML contains a bullet
+        character (``a:buChar``) or an auto-numbering (``a:buAutoNum``) element.
+        List items are added to a list group created for the paragraph; other
+        paragraphs are added as text labelled TITLE, SECTION_HEADER (subtitle
+        placeholders) or PARAGRAPH.
+
+        Args:
+            shape: Shape whose ``text_frame`` paragraphs are processed.
+            parent_slide: Parent item for the created groups and text items.
+            slide_ind: 0-based slide index used for the provenance.
+            doc: Document being populated.
+        """
+        ns = {"a": self.namespaces["a"]}
+        enum_list_item_value = 0
+        for paragraph in shape.text_frame.paragraphs:
+            enum_list_item_value += 1
+            # Check if paragraph is a bullet point using the `element` XML
+            p = paragraph._element
+            has_bullet = p.find(".//a:buChar", namespaces=ns) is not None
+            is_numbered = (not has_bullet) and (
+                p.find(".//a:buAutoNum", namespaces=ns) is not None
+            )
+            is_list_item = has_bullet or is_numbered
+
+            # Paragraphs below the top level are most likely a sub-list
+            is_a_list = is_list_item or paragraph.level > 0
+            list_text = paragraph.text.strip()
+
+            prov = self.generate_prov(shape, slide_ind, shape.text.strip())
+
+            if is_a_list:
+                # Ordered list group for auto-numbered paragraphs, plain list otherwise
+                list_label = GroupLabel.ORDERED_LIST if is_numbered else GroupLabel.LIST
+                new_list = doc.add_group(
+                    label=list_label, name="list", parent=parent_slide
+                )
+                _log.debug("LIST DETECTED!")
+            else:
+                new_list = None
+                _log.debug("No List")
+
+            # NOTE(review): one item carrying the whole paragraph text is added
+            # per non-empty run, so a paragraph with several runs is added
+            # several times -- confirm whether that is intended.
+            for e in p.iterfind(".//a:r", namespaces=ns):
+                if len(e.text.strip()) == 0:
+                    continue
+
+                if is_list_item:
+                    # Set marker and enumerated arguments if this is an enumeration element.
+                    enum_marker = str(enum_list_item_value) + "."
+                    doc.add_list_item(
+                        marker=enum_marker,
+                        enumerated=is_numbered,
+                        parent=new_list,
+                        text=list_text,
+                        prov=prov,
+                    )
+                else:
+                    # Assign proper label to the text, depending if it's a Title or Section Header
+                    # For other types of text, assign - PARAGRAPH
+                    doc_label = DocItemLabel.PARAGRAPH
+                    if shape.is_placeholder:
+                        placeholder_type = shape.placeholder_format.type
+                        if placeholder_type in [
+                            PP_PLACEHOLDER.CENTER_TITLE,
+                            PP_PLACEHOLDER.TITLE,
+                        ]:
+                            # It's a title
+                            doc_label = DocItemLabel.TITLE
+                        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                            # The label must be assigned; a bare expression has no effect
+                            doc_label = DocItemLabel.SECTION_HEADER
+
+                    # Plain text interrupts a running enumeration
+                    enum_list_item_value = 0
+
+                    doc.add_text(
+                        label=doc_label,
+                        parent=parent_slide,
+                        text=list_text,
+                        prov=prov,
+                    )
+        return
+
+    def handle_title(self, shape, parent_slide, slide_ind, doc):
+        """Add the text of a title or subtitle placeholder shape to the document.
+
+        Title and centre-title placeholders are added with the TITLE label,
+        subtitle placeholders with the SECTION_HEADER label. Shapes without
+        text, and other placeholder types, are ignored.
+        """
+        placeholder_type = shape.placeholder_format.type
+        txt = shape.text.strip()
+        prov = self.generate_prov(shape, slide_ind, txt)
+
+        if not txt:
+            return
+
+        if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
+            _log.info(f"Title found: {shape.text}")
+            label = DocItemLabel.TITLE
+        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+            _log.info(f"Subtitle found: {shape.text}")
+            # Subtitles are added as section headers
+            label = DocItemLabel.SECTION_HEADER
+        else:
+            return
+
+        doc.add_text(label=label, parent=parent_slide, text=txt, prov=prov)
+
+    def handle_pictures(self, shape, parent_slide, slide_ind, doc):
+        """Register a picture shape under its slide, without a caption."""
+        # Pictures carry no text, so provenance is generated for an empty string.
+        picture_prov = self.generate_prov(shape, slide_ind, "")
+        doc.add_picture(prov=picture_prov, caption=None, parent=parent_slide)
+
+    def handle_tables(self, shape, parent_slide, slide_ind, doc):
+        """Convert a table shape into a Docling table attached to its slide.
+
+        Only cells with non-empty (stripped) text are recorded. Row and column
+        spans are read from the ``rowSpan`` / ``gridSpan`` attributes of the
+        cell's ``a:tc`` XML element and default to 1. Nothing is added when
+        the shape has no table or when every cell is empty.
+        """
+        if not shape.has_table:
+            return
+
+        table = shape.table
+        # XML element of the shape that contains the table; spans are read from it.
+        table_xml = shape._element
+        prov = self.generate_prov(shape, slide_ind, "")
+
+        num_rows = len(table.rows)
+        num_cols = 0
+        tcells = []
+
+        for row_idx, row in enumerate(table.rows):
+            num_cols = max(num_cols, len(row.cells))
+            for col_idx, cell in enumerate(row.cells):
+                # XPath positions are 1-based, hence the +1 on both indices.
+                matches = table_xml.xpath(
+                    f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
+                )
+                if not matches:
+                    continue  # No XML node for this cell, skip it
+
+                cell_text = cell.text.strip()
+                if not cell_text:
+                    continue  # Empty cells are not recorded
+
+                cell_xml = matches[0]
+                row_span = int(cell_xml.get("rowSpan", 1))  # Vertical span
+                col_span = int(cell_xml.get("gridSpan", 1))  # Horizontal span
+
+                tcells.append(
+                    TableCell(
+                        text=cell_text,
+                        row_span=row_span,
+                        col_span=col_span,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + row_span,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + col_span,
+                        col_header=False,
+                        row_header=False,
+                    )
+                )
+
+        if tcells:
+            # Attach the table to its slide group, like pictures and text are.
+            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            doc.add_table(data=data, parent=parent_slide, prov=prov)
+        return
+
+    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
+        """Walk the slides in order and add their content to ``doc``.
+
+        For every slide a CHAPTER group named ``slide-<index>`` and a page
+        (numbered from 1) are added. Each shape is then dispatched to the
+        table, picture and text handlers. A shape may be handled by more than
+        one of them, because the checks are independent.
+
+        Returns:
+            The same ``doc`` object that was passed in.
+        """
+        # Units of size in PPTX by default are EMU units (English Metric Units)
+        slide_width = pptx_obj.slide_width
+        slide_height = pptx_obj.slide_height
+
+        # enumerate() already yields the slide position, so no index lookup
+        # on the slide collection is needed.
+        for slide_ind, slide in enumerate(pptx_obj.slides):
+            parent_slide = doc.add_group(
+                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
+            )
+
+            size = Size(width=slide_width, height=slide_height)
+            doc.add_page(page_no=slide_ind + 1, size=size)
+
+            for shape in slide.shapes:
+                if shape.has_table:
+                    # Handle tables
+                    self.handle_tables(shape, parent_slide, slide_ind, doc)
+
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    # Handle pictures
+                    self.handle_pictures(shape, parent_slide, slide_ind, doc)
+
+                # If the shape doesn't have any text, move on to the next shape
+                shape_text = getattr(shape, "text", None)
+                if shape_text is None or not shape_text.strip():
+                    continue
+                if not shape.has_text_frame:
+                    _log.warning("Warning: shape has text but not text_frame")
+                    continue
+
+                # Handle text elements, including bullet and numbered lists
+                self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+
+        return doc
@@ -0,0 +1,509 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+import docx
+from docling_core.types.doc import (
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+from lxml import etree
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class MsWordDocumentBackend(DeclarativeDocumentBackend):
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the Word document and reset the conversion state.
+
+        Args:
+            in_doc: input document descriptor, forwarded to the base class.
+            path_or_stream: the DOCX content, as a file path or in-memory stream.
+
+        Raises:
+            RuntimeError: if python-docx cannot open the document.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # Fully-qualified name of the ``w:val`` attribute (numbering values).
+        self.XML_KEY = (
+            "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+        )
+        # NOTE(review): this is the Word 2003 (WordML) namespace, while XML_KEY
+        # above uses the 2006 WordprocessingML one. Lookups made with this
+        # mapping presumably never match elements of a .docx body -- confirm
+        # before changing, since a fix would alter the conversion output.
+        self.xml_namespaces = {
+            "w": "http://schemas.microsoft.com/office/word/2003/wordml"
+        }
+        # Word file:
+        self.path_or_stream = path_or_stream
+        self.valid = False
+        # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1)
+        self.max_levels = 10
+        self.level_at_new_list = None
+        self.parents = {}  # type: ignore
+        for i in range(-1, self.max_levels):
+            self.parents[i] = None
+
+        self.level = 0
+        self.listIter = 0
+
+        # Per-paragraph history; the last entry describes the previous paragraph.
+        self.history = {
+            "names": [None],
+            "levels": [None],
+            "numids": [None],
+            "indents": [None],
+        }
+
+        self.docx_obj = None
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.docx_obj = docx.Document(self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.docx_obj = docx.Document(str(self.path_or_stream))
+
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return the ``valid`` flag, set to True once the document was opened."""
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not support pagination."""
+        return False
+
+    def unload(self):
+        """Release the input: close it when it is a stream, then drop the reference."""
+        source = self.path_or_stream
+        if isinstance(source, BytesIO):
+            source.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats handled by this backend (DOCX only)."""
+        return {InputFormat.DOCX}
+
+    def convert(self) -> DoclingDocument:
+        """Parse the DOCX into a structured document model.
+
+        The document is named after the file stem when the input is a path,
+        and ``"stream"`` otherwise.
+
+        Raises:
+            RuntimeError: if the backend is not valid.
+        """
+        source = self.path_or_stream
+        fname = source.name if isinstance(source, Path) else ""
+        docname = Path(fname).stem if fname else "stream"
+
+        origin = DocumentOrigin(
+            filename=fname,
+            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=docname, origin=origin)
+
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        assert self.docx_obj is not None
+        return self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
+
+    def update_history(self, name, level, numid, ilevel):
+        """Append the style name, level, numbering id and indent to the history."""
+        entries = (
+            ("names", name),
+            ("levels", level),
+            ("numids", numid),
+            ("indents", ilevel),
+        )
+        for key, value in entries:
+            self.history[key].append(value)
+
+    def prev_name(self):
+        """Return the most recently recorded style name."""
+        names = self.history["names"]
+        return names[len(names) - 1]
+
+    def prev_level(self):
+        """Return the most recently recorded style level."""
+        levels = self.history["levels"]
+        return levels[len(levels) - 1]
+
+    def prev_numid(self):
+        """Return the most recently recorded numbering id."""
+        numids = self.history["numids"]
+        return numids[len(numids) - 1]
+
+    def prev_indent(self):
+        """Return the most recently recorded list indent level."""
+        indents = self.history["indents"]
+        return indents[len(indents) - 1]
+
+    def get_level(self) -> int:
+        """Return the first non-negative level that has no parent assigned.
+
+        Levels are scanned in the iteration order of ``self.parents``. When
+        every non-negative level is occupied, 0 is returned.
+        """
+        for level, parent in self.parents.items():
+            # Identity check: an item that defines its own ``__eq__`` must not
+            # be mistaken for an empty slot.
+            if level >= 0 and parent is None:
+                return level
+        return 0
+
+    def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
+        """Walk the direct children of the document body in order.
+
+        Tables, elements containing a drawing or picture, and paragraphs are
+        dispatched to their handlers; any other element is ignored. A table
+        that fails to convert is skipped so the rest of the document is still
+        processed.
+
+        Returns:
+            The same ``doc`` object that was passed in.
+        """
+        for element in body:
+            tag_name = etree.QName(element).localname
+
+            # Check for inline images (drawing or pict elements).
+            # NOTE(review): xpath is called unbound on etree.ElementBase,
+            # presumably to bypass an override on the element class that does
+            # not accept ``namespaces`` -- confirm.
+            found_drawing = etree.ElementBase.xpath(
+                element, ".//w:drawing", namespaces=self.xml_namespaces
+            )
+            found_pict = etree.ElementBase.xpath(
+                element, ".//w:pict", namespaces=self.xml_namespaces
+            )
+
+            if element.tag.endswith("tbl"):
+                try:
+                    self.handle_tables(element, docx_obj, doc)
+                except Exception:
+                    # Best effort: skip the broken table, but keep the
+                    # traceback so the failure can be diagnosed.
+                    _log.debug(
+                        "could not parse a table, broken docx table", exc_info=True
+                    )
+            elif found_drawing or found_pict:
+                self.handle_pictures(element, docx_obj, doc)
+            elif tag_name == "p":
+                self.handle_text_elements(element, docx_obj, doc)
+            else:
+                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+        return doc
+
+    def str_to_int(self, s, default=0):
+        """Convert ``s`` to an int.
+
+        ``None`` is passed through as ``None`` (``default`` is not used for
+        it); a value that ``int()`` rejects with ValueError yields ``default``.
+        """
+        if s is not None:
+            try:
+                number = int(s)
+            except ValueError:
+                return default
+            return number
+        return None
+
+    def get_numId_and_ilvl(self, paragraph):
+        """Return ``(numId, ilvl)`` of the paragraph's numbering properties.
+
+        Both values are ``None`` when the paragraph has no ``w:numPr``
+        element, i.e. when it is not part of a list. A missing ``w:numId`` or
+        ``w:ilvl`` child yields ``None`` for that value.
+        """
+        elem = paragraph._element
+        num_pr = elem.find(".//w:numPr", namespaces=elem.nsmap)
+        if num_pr is None:
+            return None, None  # The paragraph is not part of a list
+
+        def _raw_value(tag):
+            # Value of the w:val attribute of the named child, if present.
+            node = num_pr.find(tag, namespaces=elem.nsmap)
+            return None if node is None else node.get(self.XML_KEY)
+
+        raw_num_id = _raw_value("w:numId")
+        raw_ilvl = _raw_value("w:ilvl")
+        num_id = self.str_to_int(raw_num_id, default=None)
+        indent = self.str_to_int(raw_ilvl, default=None)
+        return num_id, indent
+
+    def get_label_and_level(self, paragraph):
+        """Derive ``(label, level)`` from the paragraph's style name.
+
+        * no style, or a style without a name: ``("Normal", None)``
+        * ``"<name>:<int>"``: ``(name, int)``
+        * two space-separated words, one of them exactly ``"Heading"``:
+          ``("Heading", level)``, where level is the other word as an int or
+          ``None`` when it is not a number
+        * two words containing "Heading" without an exact match: ``("", 0)``
+        * anything else: ``(style name, None)``
+        """
+        style = paragraph.style
+        if style is None or style.name is None:
+            return "Normal", None
+        label = style.name
+
+        if ":" in label:
+            parts = label.split(":")
+            if len(parts) == 2:
+                try:
+                    return parts[0], int(parts[1])
+                except ValueError:
+                    # Not a "<name>:<level>" style after all (e.g. a custom
+                    # style name that contains a colon). Treat it like any
+                    # other style name instead of aborting the conversion.
+                    pass
+
+        parts = label.split(" ")
+        if "Heading" in label and len(parts) == 2:
+            if "Heading" not in parts:
+                return "", 0
+            # The word that is not "Heading" carries the level, in either order.
+            other = parts[1] if parts[0] == "Heading" else parts[0]
+            return "Heading", self.str_to_int(other, default=None)
+
+        return label, None
+
+    def handle_text_elements(self, element, docx_obj, doc):
+        """Add one paragraph element to the document.
+
+        A paragraph with a numbering id and an indent level is added as a list
+        item. Otherwise, a still-open list is closed first, and the paragraph
+        is added as a title, a heading or a plain paragraph depending on its
+        style name. The paragraph is recorded in the history in every case
+        except when its text is ``None``.
+        """
+        paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
+        if paragraph.text is None:
+            return
+
+        # Empty paragraphs are kept on purpose: they separate adjacent lists.
+        text = paragraph.text.strip()
+
+        # Common styles for bullet and numbered lists are
+        # "List Bullet", "List Number" and "List Paragraph".
+        # TODO: reliably identify whether a list is a numbered list or not
+        is_numbered = False
+
+        p_style_name, p_level = self.get_label_and_level(paragraph)
+        numid, ilevel = self.get_numId_and_ilvl(paragraph)
+
+        # A numbering id of 0 is treated the same as no numbering at all.
+        if numid == 0:
+            numid = None
+
+        # Handle lists
+        if numid is not None and ilevel is not None:
+            self.add_listitem(
+                element,
+                docx_obj,
+                doc,
+                p_style_name,
+                p_level,
+                numid,
+                ilevel,
+                text,
+                is_numbered,
+            )
+            self.update_history(p_style_name, p_level, numid, ilevel)
+            return
+
+        if numid is None and self.prev_numid() is not None:
+            # The previous paragraph was a list item: close the list by
+            # clearing every parent from the list's level onwards.
+            for key in self.parents:
+                if key >= self.level_at_new_list:
+                    self.parents[key] = None
+            self.level = self.level_at_new_list - 1
+            self.level_at_new_list = None
+
+        if p_style_name == "Title":
+            # A title resets the whole hierarchy and becomes the new root.
+            for key in self.parents:
+                self.parents[key] = None
+            self.parents[0] = doc.add_text(
+                parent=None, label=DocItemLabel.TITLE, text=text
+            )
+        elif "Heading" in p_style_name:
+            self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
+        else:
+            # Style names can be built-in ("Normal", "Quote", ...) or
+            # user-defined; all of them are treated as plain paragraph text.
+            level = self.get_level()
+            doc.add_text(
+                label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
+            )
+
+        self.update_history(p_style_name, p_level, numid, ilevel)
+
+    def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
+        """Add a heading and register it as the parent for its level.
+
+        With an integer ``curr_level``, levels skipped between the current
+        depth and ``curr_level`` are filled with SECTION groups, while deeper
+        levels left over from an earlier branch are cleared. Without an
+        integer level the heading is stored at ``self.level``.
+        """
+        level = self.get_level()
+
+        if not isinstance(curr_level, int):
+            self.parents[self.level] = doc.add_heading(
+                parent=self.parents[self.level - 1], text=text
+            )
+            return
+
+        if curr_level > level:
+            # Add invisible groups for the levels that were skipped.
+            for depth in range(level, curr_level):
+                self.parents[depth] = doc.add_group(
+                    parent=self.parents[depth - 1],
+                    label=GroupLabel.SECTION,
+                    name=f"header-{depth}",
+                )
+        elif curr_level < level:
+            # Remove the tail: everything at or below the new heading's level.
+            for key in self.parents:
+                if key >= curr_level:
+                    self.parents[key] = None
+
+        self.parents[curr_level] = doc.add_heading(
+            parent=self.parents[curr_level - 1], text=text
+        )
+
+    def add_listitem(
+        self,
+        element,
+        docx_obj,
+        doc,
+        p_style_name,
+        p_level,
+        numid,
+        ilevel,
+        text: str,
+        is_numbered=False,
+    ):
+        """Add one list paragraph, opening or closing list groups as needed.
+
+        The previous paragraph's numbering id and indent (from the history)
+        are compared with ``numid`` / ``ilevel`` to pick one of four cases:
+        open a new list, open a more deeply indented list, return to a
+        shallower indent, or continue at the same indent. If none of the four
+        conditions holds, nothing is added.
+
+        ``self.listIter`` counts items and is used for the ``"<n>."`` marker
+        when ``is_numbered`` is true; otherwise the marker is empty.
+        """
+        enum_marker = ""
+
+        level = self.get_level()
+        if self.prev_numid() is None:  # Open new list
+            # Remember where the list starts; indents are relative to it.
+            self.level_at_new_list = level  # type: ignore
+
+            self.parents[level] = doc.add_group(
+                label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
+            )
+
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            self.listIter += 1
+            if is_numbered:
+                enum_marker = str(self.listIter) + "."
+                is_numbered = True
+            doc.add_list_item(
+                marker=enum_marker,
+                enumerated=is_numbered,
+                parent=self.parents[level],
+                text=text,
+            )
+
+        elif (
+            self.prev_numid() == numid and self.prev_indent() < ilevel
+        ):  # Open indented list
+            # One nested group is created per indent step between the
+            # previous indent and the new one.
+            for i in range(
+                self.level_at_new_list + self.prev_indent() + 1,
+                self.level_at_new_list + ilevel + 1,
+            ):
+                # TODO: determine if this is an unordered list or an ordered list.
+                #  Set GroupLabel.ORDERED_LIST when it fits.
+                # The item counter restarts for the nested list.
+                self.listIter = 0
+                if is_numbered:
+                    self.parents[i] = doc.add_group(
+                        label=GroupLabel.ORDERED_LIST,
+                        name="list",
+                        parent=self.parents[i - 1],
+                    )
+                else:
+                    self.parents[i] = doc.add_group(
+                        label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
+                    )
+
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            self.listIter += 1
+            if is_numbered:
+                enum_marker = str(self.listIter) + "."
+                is_numbered = True
+            doc.add_list_item(
+                marker=enum_marker,
+                enumerated=is_numbered,
+                parent=self.parents[self.level_at_new_list + ilevel],
+                text=text,
+            )
+
+        elif self.prev_numid() == numid and ilevel < self.prev_indent():  # Close list
+            # Drop every group deeper than the indent we are returning to.
+            for k, v in self.parents.items():
+                if k > self.level_at_new_list + ilevel:
+                    self.parents[k] = None
+
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            self.listIter += 1
+            if is_numbered:
+                enum_marker = str(self.listIter) + "."
+                is_numbered = True
+            doc.add_list_item(
+                marker=enum_marker,
+                enumerated=is_numbered,
+                parent=self.parents[self.level_at_new_list + ilevel],
+                text=text,
+            )
+            # The counter is reset after the item has been added.
+            self.listIter = 0
+
+        elif self.prev_numid() == numid or self.prev_indent() == ilevel:
+            # Same list or same indent: continue under the deepest open parent.
+            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            self.listIter += 1
+            if is_numbered:
+                enum_marker = str(self.listIter) + "."
+                is_numbered = True
+            doc.add_list_item(
+                marker=enum_marker,
+                enumerated=is_numbered,
+                parent=self.parents[level - 1],
+                text=text,
+            )
+        return
+
+    def handle_tables(self, element, docx_obj, doc):
+        """Convert a ``w:tbl`` element into a Docling table.
+
+        Every cell returned by ``row.cells`` becomes a ``TableCell``; the
+        table is added under the parent one level above the first free level.
+        """
+
+        # Function to check if a cell has a colspan (gridSpan)
+        # NOTE(review): this reads gridSpan as an attribute of the cell
+        # element; in WordprocessingML it is presumably a child element of
+        # w:tcPr, in which case this lookup never matches -- confirm.
+        def get_colspan(cell):
+            grid_span = cell._element.xpath("@w:gridSpan")
+            if grid_span:
+                return int(grid_span[0])  # Return the number of columns spanned
+            return 1  # Default is 1 (no colspan)
+
+        # Function to check if a cell has a rowspan (vMerge)
+        # NOTE(review): when the lookup matches, the raw attribute value is
+        # returned (not an int), while the code below adds it to row_idx --
+        # confirm whether that path can be reached.
+        def get_rowspan(cell):
+            v_merge = cell._element.xpath("@w:vMerge")
+            if v_merge:
+                return v_merge[
+                    0
+                ]  # 'restart' indicates the beginning of a rowspan, others are continuation
+            return 1
+
+        table = docx.table.Table(element, docx_obj)
+
+        num_rows = len(table.rows)
+        num_cols = 0
+        for row in table.rows:
+            # Calculate the max number of columns (sum of the spans per row)
+            num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
+
+        # Initialize the table grid; None marks a position not yet occupied
+        table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
+
+        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+
+        for row_idx, row in enumerate(table.rows):
+            col_idx = 0
+            for c, cell in enumerate(row.cells):
+                row_span = get_rowspan(cell)
+                col_span = get_colspan(cell)
+
+                # Find the next available column in the grid
+                while table_grid[row_idx][col_idx] is not None:
+                    col_idx += 1
+
+                # Mark the grid positions covered by this cell as occupied
+                for i in range(row_span if row_span == "restart" else 1):
+                    for j in range(col_span):
+                        table_grid[row_idx + i][col_idx + j] = ""
+
+                # From here on ``cell`` refers to the Docling TableCell.
+                cell = TableCell(
+                    text=cell.text,
+                    row_span=row_span,
+                    col_span=col_span,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + row_span,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + col_span,
+                    col_header=False,  # header detection is not implemented
+                    row_header=False,  # header detection is not implemented
+                )
+
+                data.table_cells.append(cell)
+
+        level = self.get_level()
+        doc.add_table(data=data, parent=self.parents[level - 1])
+        return
+
+    def handle_pictures(self, element, docx_obj, doc):
+        """Add a caption-less picture under the parent stored at ``self.level``."""
+        picture_parent = self.parents[self.level]
+        doc.add_picture(caption=None, parent=picture_parent)
@@ -0,0 +1,78 @@
+from abc import ABC, abstractmethod
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, Optional, Set, Union
+
+from docling_core.types.doc import BoundingBox, Size
+from PIL import Image
+
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+class PdfPageBackend(ABC):
+    """Abstract interface for one page of a PDF document.
+
+    All methods are abstract; the descriptions below are taken from the
+    signatures only.
+    """
+
+    @abstractmethod
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return text for the rectangle ``bbox`` as a string."""
+        pass
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable[Cell]:
+        """Return an iterable of ``Cell`` objects for this page."""
+        pass
+
+    # NOTE(review): the parameter is named ``float`` (shadowing the builtin)
+    # and annotated ``int``; it presumably was meant to be a scale factor --
+    # confirm with implementers before renaming, as that changes the interface.
+    @abstractmethod
+    def get_bitmap_rects(self, float: int = 1) -> Iterable[BoundingBox]:
+        """Return an iterable of bounding boxes (of bitmaps, going by the name)."""
+        pass
+
+    @abstractmethod
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Return a PIL image for the page, given a scale and an optional crop box."""
+        pass
+
+    @abstractmethod
+    def get_size(self) -> Size:
+        """Return the page size."""
+        pass
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        """Return whether the page backend is usable."""
+        pass
+
+    @abstractmethod
+    def unload(self):
+        """Release the resources held for this page."""
+        pass
+
+
+class PdfDocumentBackend(PaginatedDocumentBackend):
+    """Base class for paginated PDF backends.
+
+    Image inputs are accepted as well: they are converted to a PDF held in
+    memory, which then replaces ``path_or_stream``.
+    """
+
+    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+        """Store the input, converting an image input to an in-memory PDF.
+
+        Raises:
+            RuntimeError: if the input format is neither PDF nor IMAGE.
+        """
+        super().__init__(in_doc, path_or_stream)
+
+        if self.input_format is InputFormat.PDF:
+            return
+
+        if self.input_format is not InputFormat.IMAGE:
+            raise RuntimeError(
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+            )
+
+        buf = BytesIO()
+        # The context manager releases the image once the PDF has been
+        # written, instead of leaving it open.
+        with Image.open(self.path_or_stream) as img:
+            img.save(buf, "PDF")
+        buf.seek(0)
+        self.path_or_stream = buf
+
+    @abstractmethod
+    def load_page(self, page_no: int) -> PdfPageBackend:
+        """Return the backend for page ``page_no``."""
+        pass
+
+    @abstractmethod
+    def page_count(self) -> int:
+        """Return the number of pages of the document."""
+        pass
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set of input formats declared by this backend (PDF)."""
+        return {InputFormat.PDF}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that PDF backends support pagination."""
+        return True
@@ -2,16 +2,20 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):

        return image

-    def get_size(self) -> PageSize:
-        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
+    def get_size(self) -> Size:
+        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):


 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
-        super().__init__(path_or_stream, document_hash)
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
        try:
-            self._pdoc = pdfium.PdfDocument(path_or_stream)
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        except PdfiumError as e:
            raise RuntimeError(
-                f"pypdfium could not load document {document_hash}"
+                f"pypdfium could not load document with hash {self.document_hash}"
            ) from e

    def page_count(self) -> int: