Initial commit

Christoph Auer 2024-07-15 09:42:42 +02:00
commit e2d996753b
38 changed files with 8767 additions and 0 deletions

442
.gitignore vendored Normal file

@@ -0,0 +1,442 @@
model_artifacts/
scratch/
ds_convert_models/
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.ipynb_checkpoints
*/.ipynb_checkpoints/*
# IPython
profile_default/
ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### macOS Patch ###
# iCloud generated files
*.icloud
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
# IPython
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### Visual Studio Code ###
.vscode/
### VirtualEnv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# Docs
# docs/**/*.png
# docs/**/*.svg

34
.pre-commit-config.yaml Normal file

@@ -0,0 +1,34 @@
fail_fast: true
repos:
- repo: local
hooks:
- id: system
name: Black
entry: poetry run black docling examples
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: system
name: isort
entry: poetry run isort docling examples
pass_filenames: false
language: system
files: '\.py$'
# - repo: local
# hooks:
# - id: system
# name: flake8
# entry: poetry run flake8 docling
# pass_filenames: false
# language: system
# files: '\.py$'
# - repo: local
# hooks:
# - id: system
# name: MyPy
# entry: poetry run mypy docling
# pass_filenames: false
# language: system
# files: '\.py$'

129
CODE_OF_CONDUCT.md Normal file

@@ -0,0 +1,129 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement using
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).

184
CONTRIBUTING.md Normal file

@@ -0,0 +1,184 @@
## Contributing In General
Our project welcomes external contributions. If you have an itch, please feel
free to scratch it.
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
A good way to familiarize yourself with the codebase and contribution process is
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
**Note: We appreciate your effort, and want to avoid a situation where a contribution
requires extensive rework (by you or by us), sits in backlog for a long time, or
cannot be accepted at all!**
### Proposing new features
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
before sending a pull request so the feature can be discussed. This is to avoid
you wasting your valuable time working on a feature that the project developers
are not interested in accepting into the code base.
### Fixing bugs
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
pull request so it can be tracked.
### Merge approval
The project maintainers use LGTM (Looks Good To Me) in comments on the code
review to indicate acceptance. A change requires LGTMs from two of the
maintainers of each component affected.
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
## Legal
Each source file must include a license header for the MIT
license. Using the SPDX format is the simplest approach, e.g.:
```
/*
Copyright IBM Inc. All rights reserved.
SPDX-License-Identifier: MIT
*/
```
We have tried to make it as easy as possible to make contributions. This
applies to how we handle the legal aspects of contribution. We use the
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
uses to manage code contributions.
We simply ask that when submitting a patch for review, the developer
must include a sign-off statement in the commit message.
Here is an example Signed-off-by line, which indicates that the
submitter accepts the DCO:
```
Signed-off-by: John Doe <john.doe@example.com>
```
You can include this automatically when you commit a change to your
local git repository using the following command:
```
git commit -s
```
## Communication
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
## Developing
### Usage of Poetry
We use Poetry to manage dependencies.
#### Install
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
1. Install Poetry globally on your machine
```bash
curl -sSL https://install.python-poetry.org | python3 -
```
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
2. Make sure Poetry is in your `$PATH`
- for `zsh`
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
```
- for `bash`
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
```
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
#### Create a Virtual Environment and Install Dependencies
To activate the Virtual Environment, run:
```bash
poetry shell
```
This spawns a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
```bash
poetry install
```
**(Advanced) Use a Specific Python Version**
If for whatever reason you need to work with a specific (older) version of Python, run:
```bash
poetry env use $(which python3.8)
```
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
#### Add a new dependency
```bash
poetry add NAME
```
## Coding style guidelines
We use the following tools to enforce code style:
- isort, to sort imports
- Black, to format code
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
```bash
pre-commit install
```
To run the checks on-demand, run:
```
pre-commit run --all-files
```
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
## Documentation
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
To run the documentation server, do:
```bash
mkdocs serve
```
The server will be available at [http://localhost:8000](http://localhost:8000).
### Pushing Documentation to GitHub pages
Run the following:
```bash
mkdocs gh-deploy
```

23
Dockerfile Normal file

@@ -0,0 +1,23 @@
FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get clean
RUN --mount=type=ssh \
pip install --no-cache-dir git+https://github.com/DS4SD/docling.git
ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
# On container shell:
# > cd /root/
# > python minimal.py

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

10
MAINTAINERS.md Normal file

@@ -0,0 +1,10 @@
# MAINTAINERS
- Christoph Auer - [@cau-git](https://github.com/cau-git)
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
- Ahmed Nassar - [@nassarofficial](https://github.com/nassarofficial)
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).

99
README.md Normal file

@@ -0,0 +1,99 @@
<p align="center">
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
</p>
# Docling
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Optionally applies OCR (use with scanned PDFs)
## Setup
You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).
Once you have `poetry` installed, create an environment and install the package:
```bash
poetry env use $(which python3.11)
poetry shell
poetry install
```
**Notes**:
* Works on macOS and Linux environments. Windows platforms are currently not tested.
## Usage
For basic usage, see the [convert.py](examples/convert.py) example module. Run with:
```
python examples/convert.py
```
The output of the above command will be written to `./scratch`.
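For reference, here is a minimal sketch of the underlying Python API, assembled from the classes introduced in this commit (`DocumentConverter`, `DocumentConversionInput`, `ConvertedDocument`); the actual `examples/convert.py` may differ in its details:
```python
from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

# Without an explicit artifacts_path, model artifacts are fetched from
# Hugging Face on first use (see DocumentConverter.download_models_hf).
doc_converter = DocumentConverter()

# Any local PDF works; this is the test document also used further below.
input = DocumentConversionInput.from_paths([Path("./test/data/2206.01062.pdf")])

for converted in doc_converter.convert(input):
    print(converted.render_as_markdown())
```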
### Enable or disable pipeline features
You can control whether table structure recognition or OCR should be performed through arguments passed to `DocumentConverter`:
```python
doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=PipelineOptions(do_table_structure=False, # Controls if table structure is recovered.
do_ocr=True), # Controls if OCR is applied (ignores programmatic content)
)
```
### Impose limits on the document size
You can limit the file size and the number of pages that are allowed to be processed per document:
```python
paths = [Path("./test/data/2206.01062.pdf")]
input = DocumentConversionInput.from_paths(
paths, limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
)
```
### Convert from binary PDF streams
You can convert PDFs from a binary stream instead of from the filesystem as follows:
```python
buf = BytesIO(your_binary_stream)
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
input = DocumentConversionInput.from_streams(docs)
converted_docs = doc_converter.convert(input)
```
### Limit resource usage
You can limit the CPU threads used by `docling` by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
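As a sketch of one way to do this from Python (assuming the variable is set before `docling` and its ML dependencies are imported), rather than exporting it in the shell:
```python
import os

# Must be set before importing docling and the underlying ML libraries;
# otherwise the default of 4 CPU threads applies.
os.environ["OMP_NUM_THREADS"] = "2"

from docling.document_converter import DocumentConverter  # noqa: E402
```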
## Contributing
Please read [Contributing to Docling](./CONTRIBUTING.md) for details.
## References
If you use `Docling` in your projects, please consider citing the following:
```bib
@software{Docling,
author = {Deep Search Team},
month = {7},
title = {{Docling}},
url = {https://github.com/DS4SD/docling},
version = {main},
year = {2024}
}
```
## License
The `Docling` codebase is under MIT license.
For individual model usage, please refer to the model licenses found in the original packages.

0
docling/__init__.py Normal file



@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from PIL import Image
class PdfPageBackend(ABC):
def __init__(self, page_obj: Any) -> object:
pass
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "PageSize":
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
pass
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass


@@ -0,0 +1,223 @@
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage):
super().__init__(page_obj)
self._ppage = page_obj
self.text_page = None
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
cells = []
cell_counter = 0
page_size = self.get_size()
for i in range(self.text_page.count_rects()):
rect = self.text_page.get_rect(i)
text_piece = self.text_page.get_text_bounded(*rect)
x0, y0, x1, y1 = rect
cells.append(
Cell(
id=cell_counter,
text=text_piece,
bbox=BoundingBox(
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
# The cell merging code below is to clean this up.
def merge_horizontal_cells(
cells: List[Cell],
horizontal_threshold_factor: float = 1.0,
vertical_threshold_factor: float = 0.5,
) -> List[Cell]:
if not cells:
return []
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
rows = []
current_row = [cells[0]]
row_top = cells[0].bbox.t
row_bottom = cells[0].bbox.b
row_height = cells[0].bbox.height
for cell in cells[1:]:
vertical_threshold = row_height * vertical_threshold_factor
if (
abs(cell.bbox.t - row_top) <= vertical_threshold
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
):
current_row.append(cell)
row_top = min(row_top, cell.bbox.t)
row_bottom = max(row_bottom, cell.bbox.b)
row_height = row_bottom - row_top
else:
rows.append(current_row)
current_row = [cell]
row_top = cell.bbox.t
row_bottom = cell.bbox.b
row_height = cell.bbox.height
if current_row:
rows.append(current_row)
return rows
def merge_row(row: List[Cell]) -> List[Cell]:
merged = []
current_group = [row[0]]
for cell in row[1:]:
prev_cell = current_group[-1]
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
if (
cell.bbox.l - prev_cell.bbox.r
<= avg_height * horizontal_threshold_factor
):
current_group.append(cell)
else:
merged.append(merge_group(current_group))
current_group = [cell]
if current_group:
merged.append(merge_group(current_group))
return merged
def merge_group(group: List[Cell]) -> Cell:
if len(group) == 1:
return group[0]
merged_text = "".join(cell.text for cell in group)
merged_bbox = BoundingBox(
l=min(cell.bbox.l for cell in group),
t=min(cell.bbox.t for cell in group),
r=max(cell.bbox.r for cell in group),
b=max(cell.bbox.b for cell in group),
)
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
rows = group_rows(cells)
merged_cells = [cell for row in rows for cell in merge_row(row)]
for i, cell in enumerate(merged_cells, 1):
cell.id = i
return merged_cells
def draw_clusters_and_cells():
image = self.get_page_image()
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:
cropbox = BoundingBox(
l=0,
r=page_size.width,
t=0,
b=page_size.height,
coord_origin=CoordOrigin.TOPLEFT,
)
padbox = BoundingBox(
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
)
else:
padbox = cropbox.to_bottom_left_origin(page_size.height)
padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
) # We resize the image from 1.5x the given scale to make it sharper.
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
self.text_page = None
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
super().__init__(path_or_stream)
if isinstance(path_or_stream, Path):
self._pdoc = pdfium.PdfDocument(path_or_stream)
elif isinstance(path_or_stream, BytesIO):
self._pdoc = pdfium.PdfDocument(
path_or_stream
) # TODO Fix me, won't accept bytes.
def page_count(self) -> int:
return len(self._pdoc)
def load_page(self, page_no: int) -> PdfPage:
return PyPdfiumPageBackend(self._pdoc[page_no])
def is_valid(self) -> bool:
return self.page_count() > 0
def unload(self):
self._pdoc.close()
self._pdoc = None



@@ -0,0 +1,247 @@
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator
from docling.backend.abstract_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
STARTED = auto()
FAILURE = auto()
SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto()
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class CoordOrigin(str, Enum):
TOPLEFT = auto()
BOTTOMLEFT = auto()
class PageSize(BaseModel):
width: float = 0.0
height: float = 0.0
class BoundingBox(BaseModel):
l: float # left
t: float # top
r: float # right
b: float # bottom
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
@property
def width(self):
return self.r - self.l
@property
def height(self):
return abs(self.t - self.b)
def as_tuple(self):
if self.coord_origin == CoordOrigin.TOPLEFT:
return (self.l, self.t, self.r, self.b)
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return (self.l, self.b, self.r, self.t)
@classmethod
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
)
elif origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
)
def area(self) -> float:
return (self.r - self.l) * (self.b - self.t)
def intersection_area_with(self, other: "BoundingBox") -> float:
# Calculate intersection coordinates
left = max(self.l, other.l)
top = max(self.t, other.t)
right = min(self.r, other.r)
bottom = min(self.b, other.b)
# Calculate intersection dimensions
width = right - left
height = bottom - top
# If the bounding boxes do not overlap, width or height will be negative
if width <= 0 or height <= 0:
return 0.0
return width * height
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
return self
elif self.coord_origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t,
b=page_height - self.b,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
def to_top_left_origin(self, page_height):
if self.coord_origin == CoordOrigin.TOPLEFT:
return self
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t, # self.b
b=page_height - self.b, # self.t
coord_origin=CoordOrigin.TOPLEFT,
)
class Cell(BaseModel):
id: int
text: str
bbox: BoundingBox
class OcrCell(Cell):
confidence: float
class Cluster(BaseModel):
id: int
label: str
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []
class BasePageElement(BaseModel):
label: str
id: int
page_no: int
cluster: Cluster
text: Optional[str] = None
class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class TableCell(BaseModel):
bbox: BoundingBox
row_span: int
col_span: int
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False
@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
if isinstance(data, Dict):
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "
text = text.strip()
data["text"] = text
return data
class TableElement(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
num_cols: int = 0
table_cells: List[TableCell]
class TableStructurePrediction(BaseModel):
table_map: Dict[int, TableElement] = {}
class TextElement(BasePageElement):
...
class FigureData(BaseModel):
pass
class FigureElement(BasePageElement):
data: Optional[FigureData] = None
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
class FigureClassificationPrediction(BaseModel):
figure_count: int = 0
figure_map: Dict[int, FigureElement] = {}
class EquationPrediction(BaseModel):
equation_count: int = 0
equation_map: Dict[int, TextElement] = {}
class PagePredictions(BaseModel):
layout: LayoutPrediction = None
tablestructure: TableStructurePrediction = None
figures_classification: FigureClassificationPrediction = None
equations_prediction: EquationPrediction = None
PageElement = Union[TextElement, TableElement, FigureElement]
class AssembledUnit(BaseModel):
elements: List[PageElement]
body: List[PageElement]
headers: List[PageElement]
class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
page_hash: str = None
size: PageSize = None
image: Image = None
cells: List[Cell] = None
predictions: PagePredictions = PagePredictions()
assembled: AssembledUnit = None
_backend: PdfPageBackend = None # Internal PDF backend
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str
stream: BytesIO
class PipelineOptions(BaseModel):
do_table_structure: bool = True
do_ocr: bool = False


@@ -0,0 +1,351 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
from deepsearch.documents.core.export import export_to_markdown
from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from pydantic import BaseModel
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
FigureElement,
Page,
TableElement,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash
_log = logging.getLogger(__name__)
layout_label_to_ds_type = {
"Title": "title",
"Document Index": "table-of-path_or_stream",
"Section-header": "subtitle-level-1",
"Checkbox-Selected": "checkbox-selected",
"Checkbox-Unselected": "checkbox-unselected",
"Caption": "caption",
"Page-header": "page-header",
"Page-footer": "page-footer",
"Footnote": "footnote",
"Table": "table",
"Formula": "equation",
"List-item": "paragraph",
"Code": "paragraph",
"Picture": "figure",
"Text": "paragraph",
}
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
valid: bool = False
limits: DocumentLimits = DocumentLimits()
filesize: Optional[int] = None
page_count: Optional[int] = None
_backend: PdfDocumentBackend = None # Internal PDF backend used
def __init__(
self,
path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=PyPdfiumDocumentBackend,
):
super().__init__()
self.limits = limits or DocumentLimits()
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
elif isinstance(path_or_stream, BytesIO):
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count()
if self.page_count <= self.limits.max_num_pages:
self.valid = True
except (FileNotFoundError, OSError) as e:
_log.exception(
f"File {self.file.name} not found or cannot be opened.", exc_info=e
)
# raise
except RuntimeError as e:
_log.exception(
f"An unexpected error occurred while opening the document {self.file.name}",
exc_info=e,
)
# raise
class ConvertedDocument(BaseModel):
input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors
pages: List[Page] = []
assembled: AssembledUnit = None
output: DsDocument = None
def to_ds_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no, model="default")
for p in self.pages
]
file_info = DsFileInfoObject(
filename=self.input.file.name,
document_hash=self.input.document_hash,
num_pages=self.input.page_count,
page_hashes=page_hashes,
)
main_text = []
tables = []
figures = []
page_no_to_page = {p.page_no: p for p in self.pages}
for element in self.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, TableElement):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
]
for i in range(element.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = TableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, 0],
)
],
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
BaseCell(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.pages
]
ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
def render_as_dict(self):
if self.output:
return self.output.model_dump(by_alias=True, exclude_none=True)
else:
return {}
def render_as_markdown(self):
if self.output:
return export_to_markdown(
self.output.model_dump(by_alias=True, exclude_none=True)
)
else:
return ""
class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
) -> Iterable[InputDocument]:
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
pdf_backend=pdf_backend,
)
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = paths
return doc_input
@classmethod
def from_streams(
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
):
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = streams
return doc_input


@@ -0,0 +1,32 @@
import sys
from pydantic import BaseModel
from pydantic_settings import BaseSettings
class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize
class BatchConcurrencySettings(BaseModel):
doc_batch_size: int = 2
doc_batch_concurrency: int = 2
page_batch_size: int = 4
page_batch_concurrency: int = 2
# doc_batch_size: int = 1
# doc_batch_concurrency: int = 1
# page_batch_size: int = 1
# page_batch_concurrency: int = 1
# model_concurrency: int = 2
# To force models into single core: export OMP_NUM_THREADS=1
class AppSettings(BaseSettings):
perf: BatchConcurrencySettings
settings = AppSettings(perf=BatchConcurrencySettings())


@@ -0,0 +1,207 @@
import functools
import logging
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from PIL import ImageDraw
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConvertedDocument,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash
_log = logging.getLogger(__name__)
class DocumentConverter:
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(
self,
artifacts_path: Optional[Union[Path, str]] = None,
pipeline_options: PipelineOptions = PipelineOptions(),
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
):
if not artifacts_path:
artifacts_path = self.download_models_hf()
artifacts_path = Path(artifacts_path)
self.model_pipeline = pipeline_cls(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
)
self.page_assemble_model = PageAssembleModel(config={})
self.glm_model = GlmModel(config={})
self.pdf_backend = pdf_backend
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
)
return Path(download_path)
def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
for input_batch in chunkify(
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
# with ThreadPoolExecutor(
# max_workers=settings.perf.doc_batch_concurrency
# ) as pool:
# yield from pool.map(self.process_document, input_batch)
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch)
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc)
if not in_doc.valid:
converted_doc.status = ConversionStatus.FAILURE
return converted_doc
for i in range(0, in_doc.page_count):
converted_doc.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self.populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self.parse_page_cells, in_doc),
pages_with_images,
)
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 7. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
assembled_page.image = (
None # Comment this if you want to visualize page images
)
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc)
converted_doc.status = ConversionStatus.SUCCESS
except Exception as e:
converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}")
end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
)
return converted_doc
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
return page
# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
page.image = page._backend.get_page_image()
return page
# Extract and populate the page cells and store it in the page object
def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.image, cells)
return page
def assemble_doc(self, converted_doc: ConvertedDocument):
all_elements = []
all_headers = []
all_body = []
for p in converted_doc.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
converted_doc.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
converted_doc.output = self.glm_model(converted_doc)



@@ -0,0 +1,82 @@
import copy
import random
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
from docling.datamodel.document import ConvertedDocument
class GlmModel:
def __init__(self, config):
self.config = config
load_pretrained_nlp_models()
model = init_nlp_model(model_names="language;term;reference")
self.model = model
def __call__(self, document: ConvertedDocument) -> DsDocument:
ds_doc = document.to_ds_document()
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
ds_doc_dict = to_legacy_document_format(
glm_doc, ds_doc_dict, update_name_label=True
)
exported_doc = DsDocument.model_validate(ds_doc_dict)
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
clusters_to_draw = []
image = copy.deepcopy(document.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
prov = ds_document.figures[index].prov[0]
else:
prov = None
if prov and prov.page == page_no:
clusters_to_draw.append(
Cluster(
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(document.pages[page_no].size.height),
)
)
draw = ImageDraw.Draw(image)
for c in clusters_to_draw:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return exported_doc


@@ -0,0 +1,77 @@
import copy
import logging
import random
from typing import Iterable
import numpy
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
_log = logging.getLogger(__name__)
class EasyOcrModel:
def __init__(self, config):
self.config = config
self.enabled = config["enabled"]
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled:
import easyocr
self.reader = easyocr.Reader(config["lang"])
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
# rects = page._fpage.
high_res_image = page._backend.get_page_image(scale=self.scale)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
line[0][0][0] / self.scale,
line[0][0][1] / self.scale,
line[0][2][0] / self.scale,
line[0][2][1] / self.scale,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
page.cells = cells # For now, just overwrites all digital cells.
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
yield page


@@ -0,0 +1,318 @@
import copy
import logging
import random
import time
from typing import Iterable, List
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
from docling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
CoordOrigin,
LayoutPrediction,
Page,
)
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
class LayoutModel:
TEXT_ELEM_LABELS = [
"Text",
"Footnote",
"Caption",
"Checkbox-Unselected",
"Checkbox-Selected",
"Section-header",
"Page-header",
"Page-footer",
"Code",
"List-item",
# "Formula",
]
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
TABLE_LABEL = "Table"
FIGURE_LABEL = "Picture"
FORMULA_LABEL = "Formula"
def __init__(self, config):
self.config = config
self.layout_predictor = LayoutPredictor(
config["artifacts_path"]
) # TODO temporary
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
"Caption": 0.35,
"Footnote": 0.35,
"Formula": 0.35,
"List-item": 0.35,
"Page-footer": 0.35,
"Page-header": 0.35,
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
"Section-header": 0.45,
"Table": 0.35,
"Text": 0.45,
"Title": 0.45,
"Document Index": 0.45,
"Code": 0.45,
"Checkbox-Selected": 0.45,
"Checkbox-Unselected": 0.45,
"Form": 0.45,
"Key-Value Region": 0.45,
}
_log.debug("================= Start postprocess function ====================")
start_time = time.time()
# Apply Confidence Threshold to cluster predictions
# confidence = self.conf_threshold
clusters_out = []
for cluster in clusters:
confidence = CLASS_THRESHOLDS[cluster.label]
if cluster.confidence >= confidence:
# annotation["created_by"] = "high_conf_pred"
clusters_out.append(cluster)
# map to dictionary clusters and cells, with bottom left origin
clusters = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"cell_ids": [],
"type": c.label,
}
for c in clusters
]
clusters_out = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"created_by": "high_conf_pred",
"cell_ids": [],
"type": c.label,
}
for c in clusters_out
]
raw_cells = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"text": c.text,
}
for c in cells
]
cell_count = len(raw_cells)
_log.debug("---- 0. Treat cluster overlaps ------")
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
_log.debug(
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
)
## Check for cells included in or touched by clusters:
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
# Creates a map of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Assign orphan cells with lower confidence predictions
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
clusters_out, clusters, raw_cells, orphan_cell_indices
)
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 3. Settle Ambigous Cells")
# Creates an update map after assignment of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Settle pdf cells that belong to multiple clusters
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
clusters_out, raw_cells, ambiguous_cell_indices
)
_log.debug("---- 4. Set Orphans as Text")
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
clusters_out, clusters, raw_cells, orphan_cell_indices
)
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
# Merge cells orphan cells
clusters_out = lu.merge_cells(clusters_out)
# Clean up remaining merged or otherwise unreasonable clusters
clusters_out = lu.clean_up_clusters(
clusters_out,
raw_cells,
merge_cells=True,
img_table=True,
one_cell_table=True,
)
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
clusters_out = new_clusters
## We first rebuild where every cell is now:
## Now we write into a prediction cells list, not into the raw cells list.
## As we don't need the previous labels, we simply overwrite any old list, because it might
## have been sorted differently.
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
target_cells = []
for ix, cell in enumerate(raw_cells):
new_cell = {
"id": ix,
"rawcell_id": ix,
"label": "None",
"bbox": cell["bbox"],
"text": cell["text"],
}
for cluster_index in clusters_around_cells[
ix
]: # By previous analysis, this is always 1 cluster.
new_cell["label"] = clusters_out[cluster_index]["type"]
target_cells.append(new_cell)
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
cells_out = target_cells
## -------------------------------
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
_log.debug("---- 5. Sort clusters in reading order ------")
sorted_clusters = lu.produce_reading_order(
clusters_out, "raw_cell_ids", "raw_cell_ids", True
)
clusters_out = sorted_clusters
# end_time = timer()
_log.debug("---- End of postprocessing function ------")
end_time = time.time() - start_time
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
cells_out = [
Cell(
id=c["id"],
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
text=c["text"],
)
for c in cells_out
]
clusters_out_new = []
for c in clusters_out:
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
c_new = Cluster(
id=c["id"],
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
confidence=c["confidence"],
label=c["type"],
cells=cluster_cells,
)
clusters_out_new.append(c_new)
return clusters_out_new, cells_out
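# Illustration of the class-threshold filter at the top of postprocess (hypothetical values):
# a "Text" prediction with confidence 0.50 passes its 0.45 threshold and is kept,
# a "Text" prediction at 0.30 is dropped, and a "Picture" at 0.25 is kept, since
# pictures use the deliberately low 0.2 threshold.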
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
clusters = []
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
cluster = Cluster(
id=ix,
label=pred_item["label"],
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
)
# draw_clusters_and_cells()
page.predictions.layout = LayoutPrediction(clusters=clusters)
yield page
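A small self-contained sketch (illustrative boxes, top-left origin) of the cell-to-cluster mapping rule in __call__ above: a cell is attached to a cluster when more than half of the cell's area lies inside the cluster's bounding box.

def intersection_area(a, b):
    # boxes are (x0, y0, x1, y1)
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(w, 0) * max(h, 0)

cell = (10, 10, 20, 20)    # area 100
cluster = (0, 0, 18, 30)   # covers an 8 x 10 patch of the cell
overlap_frac = intersection_area(cell, cluster) / ((cell[2] - cell[0]) * (cell[3] - cell[1]))
print(overlap_frac)        # 0.8 -> greater than 0.5, so the cell joins the cluster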

View File

@ -0,0 +1,160 @@
import logging
import re
from typing import Iterable, List
from docling.datamodel.base_models import (
AssembledUnit,
FigureElement,
Page,
PageElement,
TableElement,
TextElement,
)
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
class PageAssembleModel:
def __init__(self, config):
self.config = config
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
# def sanitize_text_poor(self, lines):
# text = '\n'.join(lines)
#
# # treat line wraps.
# sanitized_text = self.line_wrap_pattern.sub('', text)
#
# sanitized_text = sanitized_text.replace('\n', ' ')
#
# return sanitized_text
def sanitize_text(self, lines):
if len(lines) <= 1:
return " ".join(lines)
for ix, line in enumerate(lines[1:]):
prev_line = lines[ix]
if prev_line.endswith("-"):
prev_words = re.findall(r"\b[\w]+\b", prev_line)
line_words = re.findall(r"\b[\w]+\b", line)
if (
len(prev_words)
and len(line_words)
and prev_words[-1].isalnum()
and line_words[0].isalnum()
):
lines[ix] = prev_line[:-1]
else:
lines[ix] += " "
sanitized_text = "".join(lines)
return sanitized_text.strip() # Strip any leading or trailing whitespace
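# Worked example (illustration): for lines = ["convolu-", "tional networks", "are widely used"],
# the loop rewrites lines[0] to "convolu" (dropping the soft hyphen between two words) and
# appends a space to lines[1], so the joined, stripped result is
# "convolutional networks are widely used".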
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
# assembles some JSON output page by page.
elements: List[PageElement] = []
headers: List[PageElement] = []
body: List[PageElement] = []
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text=text,
page_no=page.page_no,
cluster=cluster,
)
elements.append(text_el)
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
headers.append(text_el)
else:
body.append(text_el)
elif cluster.label == LayoutModel.TABLE_LABEL:
tbl = None
if page.predictions.tablestructure:
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = TableElement(
label=cluster.label,
id=cluster.id,
text="",
otsl_seq=[],
table_cells=[],
cluster=cluster,
page_no=page.page_no,
)
elements.append(tbl)
body.append(tbl)
elif cluster.label == LayoutModel.FIGURE_LABEL:
fig = None
if page.predictions.figures_classification:
fig = page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = (
page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
)
if not equation: # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
page.assembled = AssembledUnit(
elements=elements, headers=headers, body=body
)
yield page
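A toy sketch (plain tuples, made-up labels) of the header/body split performed above for text-like elements: every element is collected in elements, while page headers and footers additionally go to headers and everything else to body.

elements, headers, body = [], [], []
for label, text in [("Section-header", "1 Introduction"), ("Page-header", "arXiv preprint"), ("Text", "Lorem ipsum")]:
    el = (label, text)
    elements.append(el)
    if label in ("Page-header", "Page-footer"):
        headers.append(el)
    else:
        body.append(el)
print([lbl for lbl, _ in body])  # ['Section-header', 'Text']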

View File

@ -0,0 +1,114 @@
from typing import Iterable
import numpy
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from docling.datamodel.base_models import (
BoundingBox,
Page,
TableCell,
TableElement,
TableStructurePrediction,
)
class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
# Third Party
import docling_ibm_models.tableformer.common as c
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]
self.tf_predictor = TFPredictor(self.tm_config)
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
page.predictions.tablestructure = TableStructurePrediction() # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l),
round(cluster.bbox.t),
round(cluster.bbox.r),
round(cluster.bbox.b),
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == "Table"
]
if not len(in_tables):
yield page
continue
tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
> 0.2
):
# Only allow non-empty strings (not just whitespace) into the cells of a table
if len(c.text.strip()) > 0:
tokens.append(c.model_dump())
iocr_page = {
"image": numpy.asarray(page.image),
"tokens": tokens,
"width": page.size.width,
"height": page.size.height,
}
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
tf_output = self.tf_predictor.multi_table_predict(
iocr_page, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, table_out in zip(table_clusters, tf_output):
table_cells = []
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(element["bbox"])
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
table_cells.append(tc)
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = TableElement(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label="Table",
)
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
yield page
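A rough sketch of the token filter applied above before calling the table predictor, using plain tuples instead of the Cell and BoundingBox objects (the inter_area helper and the values are illustrative only): a PDF cell becomes a token when more than 20% of its area falls inside a table cluster and its text is non-empty.

def inter_area(a, b):
    # boxes are (x0, y0, x1, y1)
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(w, 0) * max(h, 0)

table_bbox = (50, 100, 400, 300)
cells = [
    ((60, 110, 120, 125), "Head"),   # inside the table -> token
    ((10, 10, 40, 25), "Title"),     # outside -> skipped
    ((70, 130, 90, 140), "   "),     # inside but blank -> skipped
]
tokens = [
    text for bbox, text in cells
    if inter_area(bbox, table_bbox) / ((bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) > 0.2
    and len(text.strip()) > 0
]
print(tokens)  # ['Head']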

View File

View File

@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch
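A toy sketch of the generator chaining in apply: each model wraps the iterator returned by the previous one, so pages stream through all stages lazily. The model names below are illustrative only.

def upper_model(pages):
    for p in pages:
        yield p.upper()

def exclaim_model(pages):
    for p in pages:
        yield p + "!"

model_pipe = [upper_model, exclaim_model]
page_batch = iter(["page one", "page two"])
for model in model_pipe:
    page_batch = model(page_batch)
print(list(page_batch))  # ['PAGE ONE!', 'PAGE TWO!']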

View File

@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
self.model_pipe = [
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": False,
}
),
]

View File

View File

@ -0,0 +1,806 @@
import copy
import logging
import networkx as nx
logger = logging.getLogger("layout_utils")
## -------------------------------
## Geometric helper functions
## The coordinates grow left to right, and bottom to top.
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
def area(bbox):
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
def contains(bbox_i, bbox_j):
## Returns True if bbox_i contains bbox_j, else False
return (
bbox_i[0] <= bbox_j[0]
and bbox_i[1] <= bbox_j[1]
and bbox_i[2] >= bbox_j[2]
and bbox_i[3] >= bbox_j[3]
)
def is_intersecting(bbox_i, bbox_j):
return not (
bbox_i[2] < bbox_j[0]
or bbox_i[0] > bbox_j[2]
or bbox_i[3] < bbox_j[1]
or bbox_i[1] > bbox_j[3]
)
def bb_iou(boxA, boxB):
# determine the (x, y)-coordinates of the intersection rectangle
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
# compute the area of intersection rectangle
interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
# compute the area of both the prediction and ground-truth
# rectangles
boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the intersection area
iou = interArea / float(boxAArea + boxBArea - interArea)
# return the intersection over union value
return iou
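# Worked example (illustration, inclusive pixel coordinates): boxA = [0, 0, 9, 9] and
# boxB = [5, 5, 14, 14] give interArea = 5 * 5 = 25 and boxAArea = boxBArea = 100,
# so bb_iou returns 25 / (100 + 100 - 25), roughly 0.143.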
def compute_intersection(bbox_i, bbox_j):
## Returns the size of the intersection area of the two boxes
if not is_intersecting(bbox_i, bbox_j):
return 0
## Determine the (x, y)-coordinates of the intersection rectangle:
xA = max(bbox_i[0], bbox_j[0])
yA = max(bbox_i[1], bbox_j[1])
xB = min(bbox_i[2], bbox_j[2])
yB = min(bbox_i[3], bbox_j[3])
## Compute the area of intersection rectangle:
interArea = (xB - xA) * (yB - yA)
if interArea < 0:
logger.debug("Warning: Negative intersection detected!")
return 0
return interArea
def surrounding(bbox_i, bbox_j):
## Computes minimal box that contains both input boxes
sbox = []
sbox.append(min(bbox_i[0], bbox_j[0]))
sbox.append(min(bbox_i[1], bbox_j[1]))
sbox.append(max(bbox_i[2], bbox_j[2]))
sbox.append(max(bbox_i[3], bbox_j[3]))
return sbox
def surrounding_list(bbox_list):
## Computes minimal box that contains all boxes in the input list
## The list should be non-empty, but just in case it's not:
if len(bbox_list) == 0:
sbox = [0, 0, 0, 0]
else:
sbox = []
sbox.append(min([bbox[0] for bbox in bbox_list]))
sbox.append(min([bbox[1] for bbox in bbox_list]))
sbox.append(max([bbox[2] for bbox in bbox_list]))
sbox.append(max([bbox[3] for bbox in bbox_list]))
return sbox
def vertical_overlap(bboxA, bboxB):
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
if bboxB[3] < bboxA[1]: ## B below A
return False
elif bboxA[3] < bboxB[1]: ## A below B
return False
else:
return True
def vertical_overlap_fraction(bboxA, bboxB):
## Returns the vertical overlap as a fraction of the smaller bbox height.
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
## Height 0 is permitted in the input.
heightA = bboxA[3] - bboxA[1]
heightB = bboxB[3] - bboxB[1]
min_height = min(heightA, heightB)
if bboxA[3] >= bboxB[3]: ## A starts higher or equal
if (
bboxA[1] <= bboxB[1]
): ## B is completely in A; this can include height of B = 0:
fraction = 1
else:
overlap = max(bboxB[3] - bboxA[1], 0)
fraction = overlap / max(min_height, 0.001)
else:
if (
bboxB[1] <= bboxA[1]
): ## A is completely in B; this can include height of A = 0:
fraction = 1
else:
overlap = max(bboxA[3] - bboxB[1], 0)
fraction = overlap / max(min_height, 0.001)
return fraction
## -------------------------------
## Cluster-and-cell relations
def compute_enclosed_cells(
cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
cells_in_cluster = []
cells_in_cluster_int = []
for ix, cell in enumerate(raw_cells):
cell_bbox = cell["bbox"]
intersection = compute_intersection(cell_bbox, cluster_bbox)
frac_area = area(cell_bbox) * min_cell_intersection_with_cluster
if (
intersection > frac_area and frac_area > 0
): # intersect > certain fraction of cell
cells_in_cluster.append(ix)
cells_in_cluster_int.append(intersection)
elif contains(
cluster_bbox,
[cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
):
cells_in_cluster.append(ix)
return cells_in_cluster, cells_in_cluster_int
def find_clusters_around_cells(cell_count, clusters):
## Per raw cell, find to which clusters it belongs.
## Return list of these indices in the raw-cell order.
clusters_around_cells = [[] for _ in range(cell_count)]
for cl_ix, cluster in enumerate(clusters):
for ix in cluster["cell_ids"]:
clusters_around_cells[ix].append(cl_ix)
return clusters_around_cells
def find_cell_index(raw_ix, cell_array):
## "raw_ix" is a rawcell_id.
## "cell_array" has the structure of an (annotation) cells array.
## Returns index of cell in cell_array that has this rawcell_id.
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
return ix
def find_cell_indices(cluster, cell_array):
## "cluster" must have the structure as in a clusters array in a prediction,
## "cell_array" that of a cells array.
## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
## in the order of the rawcell_ids.
result = []
for raw_ix in sorted(cluster["cell_ids"]):
## Find the cell with this rawcell_id (if any)
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
result.append(ix)
return result
def find_first_cell_index(cluster, cell_array):
## "cluster" must be a dict with key "cell_ids"; it can also be a line.
## "cell_array" has the structure of a cells array in an annotation.
## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
result = [] ## We keep it a list as it can be empty (picture without text cells)
if len(cluster["cell_ids"]) == 0:
return result
raw_ix = min(cluster["cell_ids"])
## Find the cell with this rawcell_id (if any)
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
result.append(ix)
break ## One is enough; should be only one anyway.
if result == []:
logger.debug(
" Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
)
return result
## -------------------------------
## Cluster labels and text
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
## "cluster" must have the structure as in a clusters array in a prediction,
## "cl_ix" is its index in target_pred,
## "new_label" is the intended new label,
## "target_pred" is the entire current target prediction.
## Sets label on the cluster itself, and on the cells in the target_pred.
## Returns new_label so that also the cl_label variable in the main code is easily set.
target_pred["clusters"][cl_ix]["type"] = new_label
cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
for ix in cluster_target_cells:
target_pred["cells"][ix]["label"] = new_label
return new_label
def find_cluster_text(cluster, raw_cells):
## "cluster" must be a dict with "cell_ids"; it can also be a line.
## "raw_cells" must have the format of item["raw"]["cells"]
## Returns the text of the cluster, with blanks between the cell contents
## (which seem to be words or phrases without starting or trailing blanks).
## Note that in formulas, this may introduce more blanks than in the original.
cluster_text = ""
for raw_ix in sorted(cluster["cell_ids"]):
cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
return cluster_text.rstrip()
def find_cluster_text_without_blanks(cluster, raw_cells):
## "cluster" must be a dict with "cell_ids"; it can also be a line.
## "raw_cells" must have the format of item["raw"]["cells"]
## Returns the text of the cluster, without blanks between the cell contents
## Interesting in formula analysis.
cluster_text = ""
for raw_ix in sorted(cluster["cell_ids"]):
cluster_text = cluster_text + raw_cells[raw_ix]["text"]
return cluster_text.rstrip()
## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)
def build_cluster_from_lines(lines, label, id):
## Lines must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
## (There is no condition that they are really geometrically lines)
## A cluster in standard format is returned with given label and id
local_lines = copy.deepcopy(
lines
) ## without this, it changes "lines" also outside this function
first_line = local_lines.pop(0)
cluster = {
"id": id,
"type": label,
"cell_ids": first_line["cell_ids"],
"bbox": first_line["bbox"],
"confidence": 0,
"created_by": "merged_cells",
}
confidence = 0
counter = 0
for line in local_lines:
new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
cluster["cell_ids"] = new_cell_ids
cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
counter += 1
confidence += line["confidence"]
confidence = confidence / counter
cluster["confidence"] = confidence
return cluster
## -------------------------------
## Reading order
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
## In:
## Clusters: list as in predictions.
## cluster_sort_type: string, currently only "raw_cell_ids".
## cell_sort_type: string, currently only "raw_cell_ids".
## sort_ids: Boolean, whether the cluster ids should be adapted to their new position
## Out: Another clusters list, sorted according to the type.
logger.debug("---- Start cluster sorting ------")
if cell_sort_type == "raw_cell_ids":
for cl in clusters:
sorted_cell_ids = sorted(cl["cell_ids"])
cl["cell_ids"] = sorted_cell_ids
else:
logger.debug(
"Unknown cell_sort_type `"
+ cell_sort_type
+ "`, no cell sorting will happen."
)
if cluster_sort_type == "raw_cell_ids":
clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
logger.debug(
"Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
)
logger.debug(
" Their first cell ids: "
+ str([cl["cell_ids"][0] for cl in clusters_with_cells])
)
logger.debug(
"Clusters without cells: "
+ str([cl["id"] for cl in clusters_without_cells])
)
clusters_with_cells_sorted = sorted(
clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
)
logger.debug(
" First cell ids after sorting: "
+ str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
)
sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
else:
logger.debug(
"Unknown cluster_sort_type: `"
+ cluster_sort_type
+ "`, no cluster sorting will happen."
)
if sort_ids:
for i, cl in enumerate(sorted_clusters):
cl["id"] = i
return sorted_clusters
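# Example (illustration): clusters with cell_ids [5, 2], [0, 7] and [] are first given sorted
# cell_ids ([2, 5] and [0, 7]), then ordered by their first cell id, so the cluster starting at
# cell 0 comes first, the one starting at cell 2 second, and the cell-less cluster last;
# with sort_ids=True their ids are renumbered 0, 1, 2 in that order.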
## -------------------------------
## Line Splitting
def sort_cells_horizontal(line_cell_ids, raw_cells):
## "line_cells" should be a non-empty list of (raw) cell_ids
## "raw_cells" has the structure of item["raw"]["cells"].
## Sorts the cells in the line by x0 (left start).
new_line_cell_ids = sorted(
line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
)
return new_line_cell_ids
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
new_clusters = []
for ix, cluster in enumerate(clusters):
new_cluster = copy.deepcopy(cluster)
logger.debug(
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
new_cluster["bbox"] = new_bbox
new_clusters.append(new_cluster)
return new_clusters
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in ["Table", "Picture"]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == "Picture":
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
else:
logger.debug(" without text cells, no change.")
new_bbox = cluster["bbox"]
else: ## A table
## At least we have to keep the included text cells, and we make the bbox completely comprise them
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" Possibly increased bbox: " + str(new_bbox))
## Now we look which non-belonging cells are covered.
## (To decrease dependencies, we don't make use of which cells we actually removed.)
## We don't worry about orphan cells, those could still be added to the table.
enclosed_cells = compute_enclosed_cells(
new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
)[0]
additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
logger.debug(
" Additional cells enclosed by Table bbox: " + str(additional_cells)
)
spurious_cells = additional_cells - set(orphan_cell_indices)
logger.debug(
" Spurious cells enclosed by Table bbox (additional minus orphans): "
+ str(spurious_cells)
)
if len(spurious_cells) == 0:
return new_bbox
## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
## We initialize possible cuts with the current bbox.
left_cut = new_bbox[0]
right_cut = new_bbox[2]
upper_cut = new_bbox[3]
lower_cut = new_bbox[1]
for cell_ix in spurious_cells:
cell = raw_cells[cell_ix]
# logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
is_left = cell["bbox"][2] < min_bbox[0]
is_right = cell["bbox"][0] > min_bbox[2]
is_above = cell["bbox"][1] > min_bbox[3]
is_below = cell["bbox"][3] < min_bbox[1]
# logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))
if is_left:
if cell["bbox"][2] > left_cut:
## We move the left cut to exclude this cell:
left_cut = cell["bbox"][2]
if is_right:
if cell["bbox"][0] < right_cut:
## We move the right cut to exclude this cell:
right_cut = cell["bbox"][0]
if is_above:
if cell["bbox"][1] < upper_cut:
## We move the upper cut to exclude this cell:
upper_cut = cell["bbox"][1]
if is_below:
if cell["bbox"][3] > lower_cut:
## We move the lower cut to exclude this cell:
lower_cut = cell["bbox"][3]
# logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))
new_bbox = [left_cut, lower_cut, right_cut, upper_cut]
logger.debug(" Final bbox: " + str(new_bbox))
return new_bbox
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
DuplicateDeletedClusterIDs = []
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if cluster_1["id"] != cluster_2["id"]:
if_conf = False
if cluster_1["confidence"] > cluster_2["confidence"]:
if_conf = True
if if_conf == True:
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
DuplicateDeletedClusterIDs.append(cluster_2["id"])
elif contains(
cluster_1["bbox"],
[
cluster_2["bbox"][0] + 3,
cluster_2["bbox"][1] + 3,
cluster_2["bbox"][2] - 3,
cluster_2["bbox"][3] - 3,
],
):
DuplicateDeletedClusterIDs.append(cluster_2["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
for cl_id in DuplicateDeletedClusterIDs:
for cluster in cluster_predictions:
if cl_id == cluster["id"]:
cluster_predictions.remove(cluster)
return cluster_predictions
# Assign orphan cells using predictions from the unfiltered (low-confidence) cluster list
def assign_orphans_with_low_conf_pred(
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
for orph_id in orphan_cell_indices:
cluster_chosen = {}
iou_thresh = 0.05
confidence = 0.05
# Loop over all predictions, and find the one with the highest IOU, and confidence
for cluster in cluster_predictions_low:
calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
cluster["bbox"][2] - cluster["bbox"][0]
)
cell_area = (
raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])
if (
(iou_thresh < calc_iou)
and (cluster["confidence"] > confidence)
and (cell_area * 3 > cluster_area)
):
cluster_chosen = cluster
iou_thresh = calc_iou
confidence = cluster["confidence"]
# If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
if iou_thresh != 0.05 and confidence != 0.05:
cluster_chosen["cell_ids"].append(orph_id)
cluster_chosen["created_by"] = "orph_low_conf"
cluster_predictions.append(cluster_chosen)
orphan_cell_indices.remove(orph_id)
return cluster_predictions, orphan_cell_indices
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
for amb_cell_id in amb_cell_idxs:
highest_conf = 0
highest_bbox_iou = 0
cluster_chosen = None
problamatic_clusters = []
# Find clusters in question
for cluster in cluster_predictions:
if amb_cell_id in cluster["cell_ids"]:
problamatic_clusters.append(amb_cell_id)
# Prefer the cluster with the highest confidence and the highest IoU with this cell
bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
if (
cluster["confidence"] > highest_conf
and bbox_iou_val > highest_bbox_iou
):
cluster_chosen = cluster
highest_conf = cluster["confidence"]
highest_bbox_iou = bbox_iou_val
if cluster["id"] in problamatic_clusters:
problamatic_clusters.remove(cluster["id"])
# Now remove the cell id assignment from the remaining lower-confidence clusters
for cluster in cluster_predictions:
for prob_amb_id in problamatic_clusters:
if prob_amb_id in cluster["cell_ids"]:
cluster["cell_ids"].remove(prob_amb_id)
amb_cell_idxs.remove(amb_cell_id)
return cluster_predictions, amb_cell_idxs
def ranges(nums):
# Group sorted numbers into ranges of consecutive values.
# Used to detect (and later ignore) line numbers in review manuscripts.
nums = sorted(set(nums))
gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
return list(zip(edges, edges))
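# Example (illustration): ranges([2, 3, 4, 7, 8, 10]) builds gaps [[4, 7], [8, 10]] and
# returns [(2, 4), (7, 8), (10, 10)], i.e. the maximal runs of consecutive numbers.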
def set_orphan_as_text(
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == "Picture":
figures.append(cluster)
if cluster["id"] > max_id:
max_id = cluster["id"]
max_id += 1
lines_detector = False
content_of_orphans = []
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
content_of_orphans.append(raw_cells[orph_id]["text"])
fil_content_of_orphans = []
for cell_content in content_of_orphans:
if cell_content.isnumeric():
try:
num = int(cell_content)
fil_content_of_orphans.append(num)
except ValueError: # ignore the cell
pass
# line_orphans = []
# If there are more than 2 numeric orphan pdf cells, check whether they form
# consecutive series of numbers (using the ranges function) to decide whether
# they are line numbers.
if len(fil_content_of_orphans) > 2:
out_ranges = ranges(fil_content_of_orphans)
if len(out_ranges) > 1:
cnt_range = 0
for ranges_ in out_ranges:
if ranges_[0] != ranges_[1]:
# If a consecutive range covers more than 75 numbers (about half the typical line
# count of a review-manuscript page), decide that these are line numbers to be ignored.
if len(list(range(ranges_[0], ranges_[1]))) > 75:
lines_detector = True
# line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
fig_flag = False
# Do not assign orphan cells if they are inside a figure
for fig in figures:
if contains(fig["bbox"], orph_cell["bbox"]):
fig_flag = True
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = "Text"
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = "Text"
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
):
class_type = cluster["type"]
elif contains(
cluster["bbox"],
[
orph_cell["bbox"][0] + 3,
orph_cell["bbox"][1] + 3,
orph_cell["bbox"][2] - 3,
orph_cell["bbox"][3] - 3,
],
):
class_type = cluster["type"]
elif intersection > area(orph_cell["bbox"]) * 0.2:
class_type = cluster["type"]
new_cluster = {
"id": max_id,
"bbox": orph_cell["bbox"],
"type": class_type,
"cell_ids": [orph_id],
"confidence": -1,
"created_by": "orphan_default",
}
max_id += 1
cluster_predictions.append(new_cluster)
return cluster_predictions, orphan_cell_indices
def merge_cells(cluster_predictions):
# Using graph connected components, merge orphan clusters whose bboxes touch or are very close.
G = nx.Graph()
for cluster in cluster_predictions:
if cluster["created_by"] == "orphan_default":
G.add_node(cluster["id"])
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if (
cluster_1["id"] != cluster_2["id"]
and cluster_2["created_by"] == "orphan_default"
and cluster_1["created_by"] == "orphan_default"
):
cl1 = copy.deepcopy(cluster_1["bbox"])
cl2 = copy.deepcopy(cluster_2["bbox"])
cl1[0] = cl1[0] - 2
cl1[1] = cl1[1] - 2
cl1[2] = cl1[2] + 2
cl1[3] = cl1[3] + 2
cl2[0] = cl2[0] - 2
cl2[1] = cl2[1] - 2
cl2[2] = cl2[2] + 2
cl2[3] = cl2[3] + 2
if is_intersecting(cl1, cl2):
G.add_edge(cluster_1["id"], cluster_2["id"])
component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
max_id = -1
for cluster_1 in cluster_predictions:
if cluster_1["id"] > max_id:
max_id = cluster_1["id"]
for nodes in component:
if len(nodes) > 1:
max_id += 1
lines = []
for node in nodes:
for cluster in cluster_predictions:
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions
def clean_up_clusters(
cluster_predictions,
raw_cells,
merge_cells=False,
img_table=False,
one_cell_table=False,
):
DuplicateDeletedClusterIDs = []
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if cluster_1["id"] != cluster_2["id"]:
# remove any artifacts created by merging clusters
if merge_cells == True:
if contains(
cluster_1["bbox"],
[
cluster_2["bbox"][0] + 3,
cluster_2["bbox"][1] + 3,
cluster_2["bbox"][2] - 3,
cluster_2["bbox"][3] - 3,
],
):
cluster_1["cell_ids"] = (
cluster_1["cell_ids"] + cluster_2["cell_ids"]
)
DuplicateDeletedClusterIDs.append(cluster_2["id"])
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == "Text"
and cluster_2["type"] == "Picture"
or cluster_2["type"] == "Table"
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
elif contains(
[
cluster_2["bbox"][0] - 3,
cluster_2["bbox"][1] - 3,
cluster_2["bbox"][2] + 3,
cluster_2["bbox"][3] + 3,
],
cluster_1["bbox"],
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])
# remove tables that have one pdf cell
if one_cell_table == True:
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
for cl_id in DuplicateDeletedClusterIDs:
for cluster in cluster_predictions:
if cl_id == cluster["id"]:
cluster_predictions.remove(cluster)
return cluster_predictions
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
for cluster in clusters:
cells_in_cluster, _ = compute_enclosed_cells(
cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
)
cluster["cell_ids"] = cells_in_cluster
## These cell_ids are ids of the raw cells.
## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
return clusters
# Creates a map of cell_id->cluster_id
def cell_id_state_map(clusters, cell_count):
clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
orphan_cell_indices = [
ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
] # which cells are assigned no cluster?
ambiguous_cell_indices = [
ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
] # which cells are assigned > 1 clusters?
return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
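# Example (illustration): with cell_count = 4 and clusters whose "cell_ids" are [0, 1] and [1],
# clusters_around_cells is [[0], [0, 1], [], []], so orphan_cell_indices = [2, 3] (cells with no
# cluster) and ambiguous_cell_indices = [1] (a cell claimed by two clusters).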

41
docling/utils/utils.py Normal file
View File

@ -0,0 +1,41 @@
import hashlib
from io import BytesIO
from itertools import islice
from pathlib import Path
from typing import List, Union
def chunkify(iterator, chunk_size):
"""Yield successive chunks of chunk_size from the iterable."""
if isinstance(iterator, List):
iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1))
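# Example (illustration): list(chunkify([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]];
# the final chunk may be shorter than chunk_size.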
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
"""Create a stable page_hash of the path_or_stream of a file"""
block_size = 65536
hasher = hashlib.sha256()
def _hash_buf(binary_stream):
buf = binary_stream.read(block_size) # read and page_hash in chunks
while len(buf) > 0:
hasher.update(buf)
buf = binary_stream.read(block_size)
if isinstance(path_or_stream, Path):
with path_or_stream.open("rb") as afile:
_hash_buf(afile)
elif isinstance(path_or_stream, BytesIO):
_hash_buf(path_or_stream)
return hasher.hexdigest()
def create_hash(string: str):
hasher = hashlib.sha256()
hasher.update(string.encode("utf-8"))
return hasher.hexdigest()
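# Example (illustration): create_hash("abc") returns the SHA-256 hex digest
# "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad".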

73
examples/convert.py Normal file
View File

@ -0,0 +1,73 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
converted_docs: Iterable[ConvertedDocument],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = doc.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(doc.render_as_dict()))
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown())
else:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
# Path("/Users/cau/Downloads/Issue-36122.pdf"),
# Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"),
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
]
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
converted_docs = doc_converter.convert(input)
export_documents(converted_docs, output_dir=Path("./scratch"))
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if __name__ == "__main__":
main()

11
examples/minimal.py Normal file
View File

@ -0,0 +1,11 @@
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
input = DocumentConversionInput.from_paths(["factsheet.pdf"])
converted_docs = doc_converter.convert(input)
for d in converted_docs:
print(d.render_as_dict())

BIN
logo.png Normal file

Binary file not shown.


4865
poetry.lock generated Normal file

File diff suppressed because it is too large

72
pyproject.toml Normal file
View File

@ -0,0 +1,72 @@
[tool.poetry]
name = "docling"
version = "0.1.0"
description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
readme = "README.md"
keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.11"
pydantic = "^2.0.0"
docling-core = "^0.2.0"
docling-ibm-models = "^0.2.0"
deepsearch-glm = ">=0.18.4,<1"
deepsearch-toolkit = ">=0.47.0,<1"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
[tool.poetry.group.ocr.dependencies]
easyocr = "^1.7"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88
target-version = ["py311"]
include = '\.pyi?$'
[tool.isort]
profile = "black"
line_length = 88
py_version=311
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
python_version = "3.11"
[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]

BIN
test/data/2203.01017v2.pdf Normal file

Binary file not shown.

BIN
test/data/2206.01062.pdf Normal file

Binary file not shown.

BIN
test/data/2305.03393v1.pdf Normal file

Binary file not shown.

View File

@ -0,0 +1,33 @@
from pathlib import Path
import pytest
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.datamodel.base_models import BoundingBox
@pytest.fixture
def test_doc_path():
return Path("./data/2206.01062.pdf")
def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
assert textpiece.strip() == ref
def test_crop_page_image(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
# im.show()
def test_num_pages(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
assert doc_backend.page_count() == 9