Initial commit
This commit is contained in:
commit
e2d996753b
442
.gitignore
vendored
Normal file
442
.gitignore
vendored
Normal file
@ -0,0 +1,442 @@
|
|||||||
|
model_artifacts/
|
||||||
|
scratch/
|
||||||
|
ds_convert_models/
|
||||||
|
|
||||||
|
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||||
|
|
||||||
|
### Emacs ###
|
||||||
|
# -*- mode: gitignore; -*-
|
||||||
|
*~
|
||||||
|
\#*\#
|
||||||
|
/.emacs.desktop
|
||||||
|
/.emacs.desktop.lock
|
||||||
|
*.elc
|
||||||
|
auto-save-list
|
||||||
|
tramp
|
||||||
|
.\#*
|
||||||
|
|
||||||
|
# Org-mode
|
||||||
|
.org-id-locations
|
||||||
|
*_archive
|
||||||
|
|
||||||
|
# flymake-mode
|
||||||
|
*_flymake.*
|
||||||
|
|
||||||
|
# eshell files
|
||||||
|
/eshell/history
|
||||||
|
/eshell/lastdir
|
||||||
|
|
||||||
|
# elpa packages
|
||||||
|
/elpa/
|
||||||
|
|
||||||
|
# reftex files
|
||||||
|
*.rel
|
||||||
|
|
||||||
|
# AUCTeX auto folder
|
||||||
|
/auto/
|
||||||
|
|
||||||
|
# cask packages
|
||||||
|
.cask/
|
||||||
|
dist/
|
||||||
|
|
||||||
|
# Flycheck
|
||||||
|
flycheck_*.el
|
||||||
|
|
||||||
|
# server auth directory
|
||||||
|
/server/
|
||||||
|
|
||||||
|
# projectiles files
|
||||||
|
.projectile
|
||||||
|
|
||||||
|
# directory configuration
|
||||||
|
.dir-locals.el
|
||||||
|
|
||||||
|
# network security
|
||||||
|
/network-security.data
|
||||||
|
|
||||||
|
|
||||||
|
### JupyterNotebooks ###
|
||||||
|
# gitignore template for Jupyter Notebooks
|
||||||
|
# website: http://jupyter.org/
|
||||||
|
|
||||||
|
.ipynb_checkpoints
|
||||||
|
*/.ipynb_checkpoints/*
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# Remove previous ipynb_checkpoints
|
||||||
|
# git rm -r .ipynb_checkpoints/
|
||||||
|
|
||||||
|
### macOS ###
|
||||||
|
# General
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Icon must end with two \r
|
||||||
|
Icon
|
||||||
|
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
### macOS Patch ###
|
||||||
|
# iCloud generated files
|
||||||
|
*.icloud
|
||||||
|
|
||||||
|
### PyCharm ###
|
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
|
# User-specific stuff
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
||||||
|
|
||||||
|
# AWS User-specific
|
||||||
|
.idea/**/aws.xml
|
||||||
|
|
||||||
|
# Generated files
|
||||||
|
.idea/**/contentModel.xml
|
||||||
|
|
||||||
|
# Sensitive or high-churn files
|
||||||
|
.idea/**/dataSources/
|
||||||
|
.idea/**/dataSources.ids
|
||||||
|
.idea/**/dataSources.local.xml
|
||||||
|
.idea/**/sqlDataSources.xml
|
||||||
|
.idea/**/dynamic.xml
|
||||||
|
.idea/**/uiDesigner.xml
|
||||||
|
.idea/**/dbnavigator.xml
|
||||||
|
|
||||||
|
# Gradle
|
||||||
|
.idea/**/gradle.xml
|
||||||
|
.idea/**/libraries
|
||||||
|
|
||||||
|
# Gradle and Maven with auto-import
|
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using
|
||||||
|
# auto-import.
|
||||||
|
# .idea/artifacts
|
||||||
|
# .idea/compiler.xml
|
||||||
|
# .idea/jarRepositories.xml
|
||||||
|
# .idea/modules.xml
|
||||||
|
# .idea/*.iml
|
||||||
|
# .idea/modules
|
||||||
|
# *.iml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
cmake-build-*/
|
||||||
|
|
||||||
|
# Mongo Explorer plugin
|
||||||
|
.idea/**/mongoSettings.xml
|
||||||
|
|
||||||
|
# File-based project format
|
||||||
|
*.iws
|
||||||
|
|
||||||
|
# IntelliJ
|
||||||
|
out/
|
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin
|
||||||
|
.idea_modules/
|
||||||
|
|
||||||
|
# JIRA plugin
|
||||||
|
atlassian-ide-plugin.xml
|
||||||
|
|
||||||
|
# Cursive Clojure plugin
|
||||||
|
.idea/replstate.xml
|
||||||
|
|
||||||
|
# SonarLint plugin
|
||||||
|
.idea/sonarlint/
|
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||||
|
com_crashlytics_export_strings.xml
|
||||||
|
crashlytics.properties
|
||||||
|
crashlytics-build.properties
|
||||||
|
fabric.properties
|
||||||
|
|
||||||
|
# Editor-based Rest Client
|
||||||
|
.idea/httpRequests
|
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file
|
||||||
|
.idea/caches/build_file_checksums.ser
|
||||||
|
|
||||||
|
### PyCharm Patch ###
|
||||||
|
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
|
||||||
|
|
||||||
|
# *.iml
|
||||||
|
# modules.xml
|
||||||
|
# .idea/misc.xml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# Sonarlint plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7973-sonarlint
|
||||||
|
.idea/**/sonarlint/
|
||||||
|
|
||||||
|
# SonarQube Plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
|
||||||
|
.idea/**/sonarIssues.xml
|
||||||
|
|
||||||
|
# Markdown Navigator plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
|
||||||
|
.idea/**/markdown-navigator.xml
|
||||||
|
.idea/**/markdown-navigator-enh.xml
|
||||||
|
.idea/**/markdown-navigator/
|
||||||
|
|
||||||
|
# Cache file creation bug
|
||||||
|
# See https://youtrack.jetbrains.com/issue/JBR-2257
|
||||||
|
.idea/$CACHE_FILE$
|
||||||
|
|
||||||
|
# CodeStream plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/12206-codestream
|
||||||
|
.idea/codestream.xml
|
||||||
|
|
||||||
|
# Azure Toolkit for IntelliJ plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
|
||||||
|
.idea/**/azureSettings.xml
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
### Python Patch ###
|
||||||
|
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||||
|
poetry.toml
|
||||||
|
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
### Vim ###
|
||||||
|
# Swap
|
||||||
|
[._]*.s[a-v][a-z]
|
||||||
|
!*.svg # comment out if you don't need vector files
|
||||||
|
[._]*.sw[a-p]
|
||||||
|
[._]s[a-rt-v][a-z]
|
||||||
|
[._]ss[a-gi-z]
|
||||||
|
[._]sw[a-p]
|
||||||
|
|
||||||
|
# Session
|
||||||
|
Session.vim
|
||||||
|
Sessionx.vim
|
||||||
|
|
||||||
|
# Temporary
|
||||||
|
.netrwhist
|
||||||
|
# Auto-generated tag files
|
||||||
|
tags
|
||||||
|
# Persistent undo
|
||||||
|
[._]*.un~
|
||||||
|
|
||||||
|
|
||||||
|
### Visual Studio Code ###
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
### VirtualEnv ###
|
||||||
|
# Virtualenv
|
||||||
|
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
||||||
|
[Bb]in
|
||||||
|
[Ii]nclude
|
||||||
|
[Ll]ib
|
||||||
|
[Ll]ib64
|
||||||
|
[Ll]ocal
|
||||||
|
[Ss]cripts
|
||||||
|
pyvenv.cfg
|
||||||
|
pip-selfcheck.json
|
||||||
|
|
||||||
|
### VisualStudioCode ###
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
!.vscode/*.code-snippets
|
||||||
|
|
||||||
|
# Local History for Visual Studio Code
|
||||||
|
.history/
|
||||||
|
|
||||||
|
# Built Visual Studio Code Extensions
|
||||||
|
*.vsix
|
||||||
|
|
||||||
|
### VisualStudioCode Patch ###
|
||||||
|
# Ignore all local history of files
|
||||||
|
.history
|
||||||
|
.ionide
|
||||||
|
|
||||||
|
|
||||||
|
# Docs
|
||||||
|
# docs/**/*.png
|
||||||
|
# docs/**/*.svg
|
34
.pre-commit-config.yaml
Normal file
34
.pre-commit-config.yaml
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
fail_fast: true
|
||||||
|
repos:
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: system
|
||||||
|
name: Black
|
||||||
|
entry: poetry run black docling examples
|
||||||
|
pass_filenames: false
|
||||||
|
language: system
|
||||||
|
files: '\.py$'
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: system
|
||||||
|
name: isort
|
||||||
|
entry: poetry run isort docling examples
|
||||||
|
pass_filenames: false
|
||||||
|
language: system
|
||||||
|
files: '\.py$'
|
||||||
|
# - repo: local
|
||||||
|
# hooks:
|
||||||
|
# - id: system
|
||||||
|
# name: flake8
|
||||||
|
# entry: poetry run flake8 docling
|
||||||
|
# pass_filenames: false
|
||||||
|
# language: system
|
||||||
|
# files: '\.py$'
|
||||||
|
# - repo: local
|
||||||
|
# hooks:
|
||||||
|
# - id: system
|
||||||
|
# name: MyPy
|
||||||
|
# entry: poetry run mypy docling
|
||||||
|
# pass_filenames: false
|
||||||
|
# language: system
|
||||||
|
# files: '\.py$'
|
129
CODE_OF_CONDUCT.md
Normal file
129
CODE_OF_CONDUCT.md
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
# Contributor Covenant Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socio-economic status,
|
||||||
|
nationality, personal appearance, race, religion, or sexual identity
|
||||||
|
and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the
|
||||||
|
overall community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or
|
||||||
|
advances of any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email
|
||||||
|
address, without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official e-mail address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement using
|
||||||
|
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||||
|
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series
|
||||||
|
of actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or
|
||||||
|
permanent ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within
|
||||||
|
the community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||||
|
version 2.0, available at
|
||||||
|
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||||
|
enforcement ladder](https://github.com/mozilla/diversity).
|
||||||
|
|
||||||
|
Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the FAQ at
|
||||||
|
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
|
184
CONTRIBUTING.md
Normal file
184
CONTRIBUTING.md
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
## Contributing In General
|
||||||
|
Our project welcomes external contributions. If you have an itch, please feel
|
||||||
|
free to scratch it.
|
||||||
|
|
||||||
|
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
|
||||||
|
|
||||||
|
A good way to familiarize yourself with the codebase and contribution process is
|
||||||
|
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
|
||||||
|
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||||
|
|
||||||
|
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||||
|
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||||
|
cannot be accepted at all!**
|
||||||
|
|
||||||
|
### Proposing new features
|
||||||
|
|
||||||
|
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
|
||||||
|
before sending a pull request so the feature can be discussed. This is to avoid
|
||||||
|
you wasting your valuable time working on a feature that the project developers
|
||||||
|
are not interested in accepting into the code base.
|
||||||
|
|
||||||
|
### Fixing bugs
|
||||||
|
|
||||||
|
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
|
||||||
|
pull request so it can be tracked.
|
||||||
|
|
||||||
|
### Merge approval
|
||||||
|
|
||||||
|
The project maintainers use LGTM (Looks Good To Me) in comments on the code
|
||||||
|
review to indicate acceptance. A change requires LGTMs from two of the
|
||||||
|
maintainers of each component affected.
|
||||||
|
|
||||||
|
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
|
||||||
|
|
||||||
|
|
||||||
|
## Legal
|
||||||
|
|
||||||
|
Each source file must include a license header for the MIT
|
||||||
|
Software. Using the SPDX format is the simplest approach.
|
||||||
|
e.g.
|
||||||
|
|
||||||
|
```
|
||||||
|
/*
|
||||||
|
Copyright IBM Inc. All rights reserved.
|
||||||
|
|
||||||
|
SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
```
|
||||||
|
|
||||||
|
We have tried to make it as easy as possible to make contributions. This
|
||||||
|
applies to how we handle the legal aspects of contribution. We use the
|
||||||
|
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
|
||||||
|
uses to manage code contributions.
|
||||||
|
|
||||||
|
We simply ask that when submitting a patch for review, the developer
|
||||||
|
must include a sign-off statement in the commit message.
|
||||||
|
|
||||||
|
Here is an example Signed-off-by line, which indicates that the
|
||||||
|
submitter accepts the DCO:
|
||||||
|
|
||||||
|
```
|
||||||
|
Signed-off-by: John Doe <john.doe@example.com>
|
||||||
|
```
|
||||||
|
|
||||||
|
You can include this automatically when you commit a change to your
|
||||||
|
local git repository using the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
git commit -s
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Communication
|
||||||
|
|
||||||
|
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Developing
|
||||||
|
|
||||||
|
### Usage of Poetry
|
||||||
|
|
||||||
|
We use Poetry to manage dependencies.
|
||||||
|
|
||||||
|
|
||||||
|
#### Install
|
||||||
|
|
||||||
|
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
|
||||||
|
|
||||||
|
1. Install the Poetry globally in your machine
|
||||||
|
```bash
|
||||||
|
curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
```
|
||||||
|
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
|
||||||
|
|
||||||
|
2. Make sure Poetry is in your `$PATH`
|
||||||
|
- for `zsh`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
|
||||||
|
```
|
||||||
|
- for `bash`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
|
||||||
|
```
|
||||||
|
|
||||||
|
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
|
||||||
|
|
||||||
|
|
||||||
|
#### Create a Virtual Environment and Install Dependencies
|
||||||
|
|
||||||
|
To activate the Virtual Environment, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry shell
|
||||||
|
```
|
||||||
|
|
||||||
|
To spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
**(Advanced) Use a Specific Python Version**
|
||||||
|
|
||||||
|
If for whatever reason you need to work in a specific (older) version of Python, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry env use $(which python3.8)
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` by the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
|
||||||
|
|
||||||
|
|
||||||
|
#### Add a new dependency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry add NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
## Coding style guidelines
|
||||||
|
|
||||||
|
We use the following tools to enforce code style:
|
||||||
|
|
||||||
|
- iSort, to sort imports
|
||||||
|
- Black, to format code
|
||||||
|
|
||||||
|
|
||||||
|
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the checks on-demand, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
pre-commit run --all-files
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
|
||||||
|
|
||||||
|
To run the documentation server, do:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs serve
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will be available on [http://localhost:8000](http://localhost:8000).
|
||||||
|
|
||||||
|
### Pushing Documentation to GitHub pages
|
||||||
|
|
||||||
|
Run the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs gh-deploy
|
||||||
|
```
|
23
Dockerfile
Normal file
23
Dockerfile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
FROM python:3.11-slim-bookworm
|
||||||
|
|
||||||
|
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||||
|
&& apt-get clean
|
||||||
|
|
||||||
|
RUN --mount=type=ssh \
|
||||||
|
pip install --no-cache-dir https://github.com/DS4SD/docling.git
|
||||||
|
|
||||||
|
ENV HF_HOME=/tmp/
|
||||||
|
ENV TORCH_HOME=/tmp/
|
||||||
|
|
||||||
|
COPY examples/minimal.py /root/minimal.py
|
||||||
|
|
||||||
|
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||||
|
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||||
|
RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
|
||||||
|
|
||||||
|
# On container shell:
|
||||||
|
# > cd /root/
|
||||||
|
# > python minimal.py
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) [year] [fullname]
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
10
MAINTAINERS.md
Normal file
10
MAINTAINERS.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# MAINTAINERS
|
||||||
|
|
||||||
|
- Christoph Auer - [@cau-git](https://github.com/cau-git)
|
||||||
|
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
|
||||||
|
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
|
||||||
|
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
|
||||||
|
- Ahmed Nassar [@nassarofficial](https://github.com/nassarofficial)
|
||||||
|
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
|
||||||
|
|
||||||
|
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
99
README.md
Normal file
99
README.md
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
<p align="center">
|
||||||
|
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
# Docling
|
||||||
|
|
||||||
|
Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
||||||
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
||||||
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
||||||
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).
|
||||||
|
|
||||||
|
Once you have `poetry` installed, create an environment and install the package:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry env use $(which python3.11)
|
||||||
|
poetry shell
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notes**:
|
||||||
|
* Works on macOS and Linux environments. Windows platforms are currently not tested.
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
For basic usage, see the [convert.py](examples/convert.py) example module. Run with:
|
||||||
|
|
||||||
|
```
|
||||||
|
python examples/convert.py
|
||||||
|
```
|
||||||
|
The output of the above command will be written to `./scratch`.
|
||||||
|
|
||||||
|
### Enable or disable pipeline features
|
||||||
|
|
||||||
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`
|
||||||
|
```python
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
pipeline_options=PipelineOptions(do_table_structure=False, # Controls if table structure is recovered.
|
||||||
|
do_ocr=True), # Controls if OCR is applied (ignores programmatic content)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Impose limits on the document size
|
||||||
|
|
||||||
|
You can limit the file size and number of pages which should be allowed to process per document.
|
||||||
|
```python
|
||||||
|
paths = [Path("./test/data/2206.01062.pdf")]
|
||||||
|
|
||||||
|
input = DocumentConversionInput.from_paths(
|
||||||
|
paths, limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Convert from binary PDF streams
|
||||||
|
|
||||||
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
||||||
|
```python
|
||||||
|
buf = BytesIO(your_binary_stream)
|
||||||
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||||
|
input = DocumentConversionInput.from_streams(docs)
|
||||||
|
converted_docs = doc_converter.convert(input)
|
||||||
|
```
|
||||||
|
### Limit resource usage
|
||||||
|
|
||||||
|
You can limit the CPU threads used by `docling` by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Please read [Contributing to Docling](./CONTRIBUTING.md) for details.
|
||||||
|
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
If you use `Docling` in your projects, please consider citing the following:
|
||||||
|
|
||||||
|
```bib
|
||||||
|
@software{Docling,
|
||||||
|
author = {Deep Search Team},
|
||||||
|
month = {7},
|
||||||
|
title = {{Docling}},
|
||||||
|
url = {https://github.com/DS4SD/docling},
|
||||||
|
version = {main},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
The `Docling` codebase is under MIT license.
|
||||||
|
For individual model usage, please refer to the model licenses found in the original packages.
|
0
docling/__init__.py
Normal file
0
docling/__init__.py
Normal file
0
docling/backend/__init__.py
Normal file
0
docling/backend/__init__.py
Normal file
55
docling/backend/abstract_backend.py
Normal file
55
docling/backend/abstract_backend.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable, Optional, Union
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class PdfPageBackend(ABC):
|
||||||
|
def __init__(self, page_obj: Any) -> object:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_text_cells(self) -> Iterable["Cell"]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_page_image(
|
||||||
|
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
|
||||||
|
) -> Image.Image:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_size(self) -> "PageSize":
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def unload(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PdfDocumentBackend(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def page_count(self) -> int:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def unload(self):
|
||||||
|
pass
|
223
docling/backend/pypdfium2_backend.py
Normal file
223
docling/backend/pypdfium2_backend.py
Normal file
@ -0,0 +1,223 @@
|
|||||||
|
import random
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfiumPageBackend(PdfPageBackend):
|
||||||
|
def __init__(self, page_obj: PdfPage):
|
||||||
|
super().__init__(page_obj)
|
||||||
|
self._ppage = page_obj
|
||||||
|
self.text_page = None
|
||||||
|
|
||||||
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||||
|
if not self.text_page:
|
||||||
|
self.text_page = self._ppage.get_textpage()
|
||||||
|
|
||||||
|
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||||
|
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||||
|
|
||||||
|
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||||
|
|
||||||
|
return text_piece
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Iterable[Cell]:
|
||||||
|
if not self.text_page:
|
||||||
|
self.text_page = self._ppage.get_textpage()
|
||||||
|
|
||||||
|
cells = []
|
||||||
|
cell_counter = 0
|
||||||
|
|
||||||
|
page_size = self.get_size()
|
||||||
|
|
||||||
|
for i in range(self.text_page.count_rects()):
|
||||||
|
rect = self.text_page.get_rect(i)
|
||||||
|
text_piece = self.text_page.get_text_bounded(*rect)
|
||||||
|
x0, y0, x1, y1 = rect
|
||||||
|
cells.append(
|
||||||
|
Cell(
|
||||||
|
id=cell_counter,
|
||||||
|
text=text_piece,
|
||||||
|
bbox=BoundingBox(
|
||||||
|
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_size.height),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
cell_counter += 1
|
||||||
|
|
||||||
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
||||||
|
# The cell merging code below is to clean this up.
|
||||||
|
def merge_horizontal_cells(
|
||||||
|
cells: List[Cell],
|
||||||
|
horizontal_threshold_factor: float = 1.0,
|
||||||
|
vertical_threshold_factor: float = 0.5,
|
||||||
|
) -> List[Cell]:
|
||||||
|
if not cells:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
||||||
|
rows = []
|
||||||
|
current_row = [cells[0]]
|
||||||
|
row_top = cells[0].bbox.t
|
||||||
|
row_bottom = cells[0].bbox.b
|
||||||
|
row_height = cells[0].bbox.height
|
||||||
|
|
||||||
|
for cell in cells[1:]:
|
||||||
|
vertical_threshold = row_height * vertical_threshold_factor
|
||||||
|
if (
|
||||||
|
abs(cell.bbox.t - row_top) <= vertical_threshold
|
||||||
|
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
||||||
|
):
|
||||||
|
current_row.append(cell)
|
||||||
|
row_top = min(row_top, cell.bbox.t)
|
||||||
|
row_bottom = max(row_bottom, cell.bbox.b)
|
||||||
|
row_height = row_bottom - row_top
|
||||||
|
else:
|
||||||
|
rows.append(current_row)
|
||||||
|
current_row = [cell]
|
||||||
|
row_top = cell.bbox.t
|
||||||
|
row_bottom = cell.bbox.b
|
||||||
|
row_height = cell.bbox.height
|
||||||
|
|
||||||
|
if current_row:
|
||||||
|
rows.append(current_row)
|
||||||
|
|
||||||
|
return rows
|
||||||
|
|
||||||
|
def merge_row(row: List[Cell]) -> List[Cell]:
|
||||||
|
merged = []
|
||||||
|
current_group = [row[0]]
|
||||||
|
|
||||||
|
for cell in row[1:]:
|
||||||
|
prev_cell = current_group[-1]
|
||||||
|
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
||||||
|
if (
|
||||||
|
cell.bbox.l - prev_cell.bbox.r
|
||||||
|
<= avg_height * horizontal_threshold_factor
|
||||||
|
):
|
||||||
|
current_group.append(cell)
|
||||||
|
else:
|
||||||
|
merged.append(merge_group(current_group))
|
||||||
|
current_group = [cell]
|
||||||
|
|
||||||
|
if current_group:
|
||||||
|
merged.append(merge_group(current_group))
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
def merge_group(group: List[Cell]) -> Cell:
|
||||||
|
if len(group) == 1:
|
||||||
|
return group[0]
|
||||||
|
|
||||||
|
merged_text = "".join(cell.text for cell in group)
|
||||||
|
merged_bbox = BoundingBox(
|
||||||
|
l=min(cell.bbox.l for cell in group),
|
||||||
|
t=min(cell.bbox.t for cell in group),
|
||||||
|
r=max(cell.bbox.r for cell in group),
|
||||||
|
b=max(cell.bbox.b for cell in group),
|
||||||
|
)
|
||||||
|
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
||||||
|
|
||||||
|
rows = group_rows(cells)
|
||||||
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
||||||
|
|
||||||
|
for i, cell in enumerate(merged_cells, 1):
|
||||||
|
cell.id = i
|
||||||
|
|
||||||
|
return merged_cells
|
||||||
|
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = self.get_page_image()
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in cells:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# before merge:
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
cells = merge_horizontal_cells(cells)
|
||||||
|
|
||||||
|
# after merge:
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
return cells
|
||||||
|
|
||||||
|
def get_page_image(
|
||||||
|
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
||||||
|
) -> Image.Image:
|
||||||
|
|
||||||
|
page_size = self.get_size()
|
||||||
|
|
||||||
|
if not cropbox:
|
||||||
|
cropbox = BoundingBox(
|
||||||
|
l=0,
|
||||||
|
r=page_size.width,
|
||||||
|
t=0,
|
||||||
|
b=page_size.height,
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
padbox = BoundingBox(
|
||||||
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
||||||
|
padbox.r = page_size.width - padbox.r
|
||||||
|
padbox.t = page_size.height - padbox.t
|
||||||
|
|
||||||
|
image = (
|
||||||
|
self._ppage.render(
|
||||||
|
scale=scale * 1.5,
|
||||||
|
rotation=0, # no additional rotation
|
||||||
|
crop=padbox.as_tuple(),
|
||||||
|
)
|
||||||
|
.to_pil()
|
||||||
|
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
||||||
|
) # We resize the image from 1.5x the given scale to make it sharper.
|
||||||
|
|
||||||
|
return image
|
||||||
|
|
||||||
|
def get_size(self) -> PageSize:
|
||||||
|
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
self._ppage = None
|
||||||
|
self.text_page = None
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||||
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
||||||
|
super().__init__(path_or_stream)
|
||||||
|
|
||||||
|
if isinstance(path_or_stream, Path):
|
||||||
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||||
|
elif isinstance(path_or_stream, BytesIO):
|
||||||
|
self._pdoc = pdfium.PdfDocument(
|
||||||
|
path_or_stream
|
||||||
|
) # TODO Fix me, won't accept bytes.
|
||||||
|
|
||||||
|
def page_count(self) -> int:
|
||||||
|
return len(self._pdoc)
|
||||||
|
|
||||||
|
def load_page(self, page_no: int) -> PdfPage:
|
||||||
|
return PyPdfiumPageBackend(self._pdoc[page_no])
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return self.page_count() > 0
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
self._pdoc.close()
|
||||||
|
self._pdoc = None
|
0
docling/datamodel/__init__.py
Normal file
0
docling/datamodel/__init__.py
Normal file
247
docling/datamodel/base_models.py
Normal file
247
docling/datamodel/base_models.py
Normal file
@ -0,0 +1,247 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
from pydantic import BaseModel, ConfigDict, model_validator
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfPageBackend
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionStatus(str, Enum):
|
||||||
|
PENDING = auto()
|
||||||
|
STARTED = auto()
|
||||||
|
FAILURE = auto()
|
||||||
|
SUCCESS = auto()
|
||||||
|
SUCCESS_WITH_ERRORS = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class DocInputType(str, Enum):
|
||||||
|
PATH = auto()
|
||||||
|
STREAM = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class CoordOrigin(str, Enum):
|
||||||
|
TOPLEFT = auto()
|
||||||
|
BOTTOMLEFT = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class PageSize(BaseModel):
|
||||||
|
width: float = 0.0
|
||||||
|
height: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class BoundingBox(BaseModel):
|
||||||
|
l: float # left
|
||||||
|
t: float # top
|
||||||
|
r: float # right
|
||||||
|
b: float # bottom
|
||||||
|
|
||||||
|
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
||||||
|
|
||||||
|
@property
|
||||||
|
def width(self):
|
||||||
|
return self.r - self.l
|
||||||
|
|
||||||
|
@property
|
||||||
|
def height(self):
|
||||||
|
return abs(self.t - self.b)
|
||||||
|
|
||||||
|
def as_tuple(self):
|
||||||
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return (self.l, self.t, self.r, self.b)
|
||||||
|
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return (self.l, self.b, self.r, self.t)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
||||||
|
if origin == CoordOrigin.TOPLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
|
||||||
|
)
|
||||||
|
elif origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
|
||||||
|
)
|
||||||
|
|
||||||
|
def area(self) -> float:
|
||||||
|
return (self.r - self.l) * (self.b - self.t)
|
||||||
|
|
||||||
|
def intersection_area_with(self, other: "BoundingBox") -> float:
|
||||||
|
# Calculate intersection coordinates
|
||||||
|
left = max(self.l, other.l)
|
||||||
|
top = max(self.t, other.t)
|
||||||
|
right = min(self.r, other.r)
|
||||||
|
bottom = min(self.b, other.b)
|
||||||
|
|
||||||
|
# Calculate intersection dimensions
|
||||||
|
width = right - left
|
||||||
|
height = bottom - top
|
||||||
|
|
||||||
|
# If the bounding boxes do not overlap, width or height will be negative
|
||||||
|
if width <= 0 or height <= 0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
return width * height
|
||||||
|
|
||||||
|
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
||||||
|
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return self
|
||||||
|
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=self.l,
|
||||||
|
r=self.r,
|
||||||
|
t=page_height - self.t,
|
||||||
|
b=page_height - self.b,
|
||||||
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_top_left_origin(self, page_height):
|
||||||
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return self
|
||||||
|
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=self.l,
|
||||||
|
r=self.r,
|
||||||
|
t=page_height - self.t, # self.b
|
||||||
|
b=page_height - self.b, # self.t
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Cell(BaseModel):
|
||||||
|
id: int
|
||||||
|
text: str
|
||||||
|
bbox: BoundingBox
|
||||||
|
|
||||||
|
|
||||||
|
class OcrCell(Cell):
|
||||||
|
confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
class Cluster(BaseModel):
|
||||||
|
id: int
|
||||||
|
label: str
|
||||||
|
bbox: BoundingBox
|
||||||
|
confidence: float = 1.0
|
||||||
|
cells: List[Cell] = []
|
||||||
|
|
||||||
|
|
||||||
|
class BasePageElement(BaseModel):
|
||||||
|
label: str
|
||||||
|
id: int
|
||||||
|
page_no: int
|
||||||
|
cluster: Cluster
|
||||||
|
text: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutPrediction(BaseModel):
|
||||||
|
clusters: List[Cluster] = []
|
||||||
|
|
||||||
|
|
||||||
|
class TableCell(BaseModel):
|
||||||
|
bbox: BoundingBox
|
||||||
|
row_span: int
|
||||||
|
col_span: int
|
||||||
|
start_row_offset_idx: int
|
||||||
|
end_row_offset_idx: int
|
||||||
|
start_col_offset_idx: int
|
||||||
|
end_col_offset_idx: int
|
||||||
|
text: str
|
||||||
|
column_header: bool = False
|
||||||
|
row_header: bool = False
|
||||||
|
row_section: bool = False
|
||||||
|
|
||||||
|
@model_validator(mode="before")
|
||||||
|
@classmethod
|
||||||
|
def from_dict_format(cls, data: Any) -> Any:
|
||||||
|
if isinstance(data, Dict):
|
||||||
|
text = data["bbox"].get("token", "")
|
||||||
|
if not len(text):
|
||||||
|
text_cells = data.pop("text_cell_bboxes", None)
|
||||||
|
if text_cells:
|
||||||
|
for el in text_cells:
|
||||||
|
text += el["token"] + " "
|
||||||
|
|
||||||
|
text = text.strip()
|
||||||
|
data["text"] = text
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
class TableElement(BasePageElement):
|
||||||
|
otsl_seq: List[str]
|
||||||
|
num_rows: int = 0
|
||||||
|
num_cols: int = 0
|
||||||
|
table_cells: List[TableCell]
|
||||||
|
|
||||||
|
|
||||||
|
class TableStructurePrediction(BaseModel):
|
||||||
|
table_map: Dict[int, TableElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class TextElement(BasePageElement):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class FigureData(BaseModel):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class FigureElement(BasePageElement):
|
||||||
|
data: Optional[FigureData] = None
|
||||||
|
provenance: Optional[str] = None
|
||||||
|
predicted_class: Optional[str] = None
|
||||||
|
confidence: Optional[float] = None
|
||||||
|
|
||||||
|
|
||||||
|
class FigureClassificationPrediction(BaseModel):
|
||||||
|
figure_count: int = 0
|
||||||
|
figure_map: Dict[int, FigureElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class EquationPrediction(BaseModel):
|
||||||
|
equation_count: int = 0
|
||||||
|
equation_map: Dict[int, TextElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class PagePredictions(BaseModel):
|
||||||
|
layout: LayoutPrediction = None
|
||||||
|
tablestructure: TableStructurePrediction = None
|
||||||
|
figures_classification: FigureClassificationPrediction = None
|
||||||
|
equations_prediction: EquationPrediction = None
|
||||||
|
|
||||||
|
|
||||||
|
PageElement = Union[TextElement, TableElement, FigureElement]
|
||||||
|
|
||||||
|
|
||||||
|
class AssembledUnit(BaseModel):
|
||||||
|
elements: List[PageElement]
|
||||||
|
body: List[PageElement]
|
||||||
|
headers: List[PageElement]
|
||||||
|
|
||||||
|
|
||||||
|
class Page(BaseModel):
|
||||||
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
|
page_no: int
|
||||||
|
page_hash: str = None
|
||||||
|
size: PageSize = None
|
||||||
|
image: Image = None
|
||||||
|
cells: List[Cell] = None
|
||||||
|
predictions: PagePredictions = PagePredictions()
|
||||||
|
assembled: AssembledUnit = None
|
||||||
|
|
||||||
|
_backend: PdfPageBackend = None # Internal PDF backend
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentStream(BaseModel):
|
||||||
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
|
filename: str
|
||||||
|
stream: BytesIO
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineOptions(BaseModel):
|
||||||
|
do_table_structure: bool = True
|
||||||
|
do_ocr: bool = False
|
351
docling/datamodel/document.py
Normal file
351
docling/datamodel/document.py
Normal file
@ -0,0 +1,351 @@
|
|||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path, PurePath
|
||||||
|
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
||||||
|
|
||||||
|
from deepsearch.documents.core.export import export_to_markdown
|
||||||
|
from docling_core.types import BaseCell, BaseText
|
||||||
|
from docling_core.types import BoundingBox as DsBoundingBox
|
||||||
|
from docling_core.types import Document as DsDocument
|
||||||
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||||
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||||
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||||
|
from docling_core.types import Table as DsSchemaTable
|
||||||
|
from docling_core.types import TableCell
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
AssembledUnit,
|
||||||
|
ConversionStatus,
|
||||||
|
DocumentStream,
|
||||||
|
FigureElement,
|
||||||
|
Page,
|
||||||
|
TableElement,
|
||||||
|
TextElement,
|
||||||
|
)
|
||||||
|
from docling.datamodel.settings import DocumentLimits
|
||||||
|
from docling.utils.utils import create_file_hash
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
layout_label_to_ds_type = {
|
||||||
|
"Title": "title",
|
||||||
|
"Document Index": "table-of-path_or_stream",
|
||||||
|
"Section-header": "subtitle-level-1",
|
||||||
|
"Checkbox-Selected": "checkbox-selected",
|
||||||
|
"Checkbox-Unselected": "checkbox-unselected",
|
||||||
|
"Caption": "caption",
|
||||||
|
"Page-header": "page-header",
|
||||||
|
"Page-footer": "page-footer",
|
||||||
|
"Footnote": "footnote",
|
||||||
|
"Table": "table",
|
||||||
|
"Formula": "equation",
|
||||||
|
"List-item": "paragraph",
|
||||||
|
"Code": "paragraph",
|
||||||
|
"Picture": "figure",
|
||||||
|
"Text": "paragraph",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class InputDocument(BaseModel):
    file: PurePath = None
    document_hash: Optional[str] = None
    valid: bool = False
    limits: DocumentLimits = DocumentLimits()

    filesize: Optional[int] = None
    page_count: Optional[int] = None

    _backend: PdfDocumentBackend = None  # Internal PDF backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
        pdf_backend=PyPdfiumDocumentBackend,
    ):
        super().__init__()

        self.limits = limits or DocumentLimits()

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            if self.document_hash and self._backend.page_count() > 0:
                self.page_count = self._backend.page_count()

                if self.page_count <= self.limits.max_num_pages:
                    self.valid = True

        except (FileNotFoundError, OSError) as e:
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            _log.exception(
                f"An unexpected error occurred while opening the document {self.file.name}",
                exc_info=e,
            )
            # raise


class ConvertedDocument(BaseModel):
    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: List[Dict] = []  # structure to keep errors

    pages: List[Page] = []
    assembled: AssembledUnit = None

    output: DsDocument = None

    def to_ds_document(self) -> DsDocument:
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
            PageReference(hash=p.page_hash, page=p.page_no, model="default")
            for p in self.pages
        ]

        file_info = DsFileInfoObject(
            filename=self.input.file.name,
            document_hash=self.input.document_hash,
            num_pages=self.input.page_count,
            page_hashes=page_hashes,
        )

        main_text = []
        tables = []
        figures = []

        page_no_to_page = {p.page_no: p for p in self.pages}

        for element in self.assembled.elements:
            # Convert bboxes to lower-left origin.
            target_bbox = DsBoundingBox(
                element.cluster.bbox.to_bottom_left_origin(
                    page_no_to_page[element.page_no].size.height
                ).as_tuple()
            )

            if isinstance(element, TextElement):
                main_text.append(
                    BaseText(
                        text=element.text,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        name=element.label,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, len(element.text)],
                            )
                        ],
                    )
                )
            elif isinstance(element, TableElement):
                index = len(tables)
                ref_str = f"#/tables/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )

                # Initialise empty table data grid (only empty cells)
                table_data = [
                    [
                        TableCell(
                            text="",
                            # bbox=[0,0,0,0],
                            spans=[[i, j]],
                            obj_type="body",
                        )
                        for j in range(element.num_cols)
                    ]
                    for i in range(element.num_rows)
                ]

                # Overwrite cells in table data for which there is actual cell content.
                for cell in element.table_cells:
                    for i in range(
                        min(cell.start_row_offset_idx, element.num_rows),
                        min(cell.end_row_offset_idx, element.num_rows),
                    ):
                        for j in range(
                            min(cell.start_col_offset_idx, element.num_cols),
                            min(cell.end_col_offset_idx, element.num_cols),
                        ):
                            celltype = "body"
                            if cell.column_header:
                                celltype = "col_header"
                            elif cell.row_header:
                                celltype = "row_header"

                            def make_spans(cell):
                                for rspan in range(
                                    min(cell.start_row_offset_idx, element.num_rows),
                                    min(cell.end_row_offset_idx, element.num_rows),
                                ):
                                    for cspan in range(
                                        min(
                                            cell.start_col_offset_idx, element.num_cols
                                        ),
                                        min(cell.end_col_offset_idx, element.num_cols),
                                    ):
                                        yield [rspan, cspan]

                            spans = list(make_spans(cell))
                            table_data[i][j] = TableCell(
                                text=cell.text,
                                bbox=cell.bbox.to_bottom_left_origin(
                                    page_no_to_page[element.page_no].size.height
                                ).as_tuple(),
                                # col=j,
                                # row=i,
                                spans=spans,
                                obj_type=celltype,
                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
                            )

                tables.append(
                    DsSchemaTable(
                        num_cols=element.num_cols,
                        num_rows=element.num_rows,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        data=table_data,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                    )
                )

            elif isinstance(element, FigureElement):
                index = len(figures)
                ref_str = f"#/figures/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )
                figures.append(
                    BaseCell(
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                        obj_type=layout_label_to_ds_type.get(element.label),
                        # data=[[]],
                    )
                )

        page_dimensions = [
            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
            for p in self.pages
        ]

        ds_doc = DsDocument(
            name=title,
            description=desc,
            file_info=file_info,
            main_text=main_text,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
        )

        return ds_doc

    def render_as_dict(self):
        if self.output:
            return self.output.model_dump(by_alias=True, exclude_none=True)
        else:
            return {}

    def render_as_markdown(self):
        if self.output:
            return export_to_markdown(
                self.output.model_dump(by_alias=True, exclude_none=True)
            )
        else:
            return ""


class DocumentConversionInput(BaseModel):

    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend

    def docs(
        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
    ) -> Iterable[InputDocument]:

        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND

        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
                    pdf_backend=pdf_backend,
                )

    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        paths = [Path(p) for p in paths]

        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = paths

        return doc_input

    @classmethod
    def from_streams(
        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
    ):
        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = streams

        return doc_input
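A minimal usage sketch for the input model above; the file name is illustrative and the DocumentLimits argument is optional:

from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.settings import DocumentLimits

# "report.pdf" is a placeholder path; InputDocument validates file size and
# page count against the given DocumentLimits when the backend opens the file.
doc_input = DocumentConversionInput.from_paths(
    [Path("report.pdf")], limits=DocumentLimits(max_num_pages=50)
)
for in_doc in doc_input.docs():
    print(in_doc.file.name, in_doc.valid, in_doc.page_count)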
32
docling/datamodel/settings.py
Normal file
@@ -0,0 +1,32 @@
import sys

from pydantic import BaseModel
from pydantic_settings import BaseSettings


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize


class BatchConcurrencySettings(BaseModel):
    doc_batch_size: int = 2
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1

    # model_concurrency: int = 2

    # To force models into single core: export OMP_NUM_THREADS=1


class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings


settings = AppSettings(perf=BatchConcurrencySettings())
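A small sketch of how these defaults could be tuned at runtime; it assumes the pydantic models keep their default, mutable configuration:

from docling.datamodel.settings import settings

# Process one document and one page batch at a time, e.g. while debugging.
settings.perf.doc_batch_size = 1
settings.perf.page_batch_size = 1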
207
docling/document_converter.py
Normal file
@@ -0,0 +1,207 @@
import functools
import logging
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union

from PIL import ImageDraw

from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
    Page,
    PipelineOptions,
)
from docling.datamodel.document import (
    ConvertedDocument,
    DocumentConversionInput,
    InputDocument,
)
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash

_log = logging.getLogger(__name__)


class DocumentConverter:
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(
        self,
        artifacts_path: Optional[Union[Path, str]] = None,
        pipeline_options: PipelineOptions = PipelineOptions(),
        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
    ):
        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        artifacts_path = Path(artifacts_path)

        self.model_pipeline = pipeline_cls(
            artifacts_path=artifacts_path, pipeline_options=pipeline_options
        )

        self.page_assemble_model = PageAssembleModel(config={})
        self.glm_model = GlmModel(config={})
        self.pdf_backend = pdf_backend

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:

        for input_batch in chunkify(
            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
        ):
            _log.info(f"Going to convert document batch...")
            # parallel processing only within input_batch
            # with ThreadPoolExecutor(
            #    max_workers=settings.perf.doc_batch_concurrency
            # ) as pool:
            #   yield from pool.map(self.process_document, input_batch)

            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
            yield from map(self.process_document, input_batch)

    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
        start_doc_time = time.time()
        converted_doc = ConvertedDocument(input=in_doc)

        if not in_doc.valid:
            converted_doc.status = ConversionStatus.FAILURE
            return converted_doc

        for i in range(0, in_doc.page_count):
            converted_doc.pages.append(Page(page_no=i))

        all_assembled_pages = []

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(
                converted_doc.pages, settings.perf.page_batch_size
            ):

                start_pb_time = time.time()
                # Pipeline

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Populate page image
                pages_with_images = map(
                    functools.partial(self.populate_page_images, in_doc), init_pages
                )

                # 3. Populate programmatic page cells
                pages_with_cells = map(
                    functools.partial(self.parse_page_cells, in_doc),
                    pages_with_images,
                )

                pipeline_pages = self.model_pipeline.apply(pages_with_cells)

                # 7. Assemble page elements (per page)
                assembled_pages = self.page_assemble_model(pipeline_pages)

                # exhaust assembled_pages
                for assembled_page in assembled_pages:
                    # Free up mem resources before moving on with next batch
                    assembled_page.image = (
                        None  # Comment this if you want to visualize page images
                    )
                    assembled_page._backend.unload()

                    all_assembled_pages.append(assembled_page)

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

            # Free up mem resources of PDF backend
            in_doc._backend.unload()

            converted_doc.pages = all_assembled_pages
            self.assemble_doc(converted_doc)

            converted_doc.status = ConversionStatus.SUCCESS

        except Exception as e:
            converted_doc.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.info(f"Encountered an error during conversion: {trace}")

        end_doc_time = time.time() - start_doc_time
        _log.info(
            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
        )

        return converted_doc

    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()
        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))

        return page

    # Generate the page image and store it in the page object
    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
        page.image = page._backend.get_page_image()

        return page

    # Extract and populate the page cells and store it in the page object
    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
        page.cells = page._backend.get_text_cells()

        # DEBUG code:
        def draw_text_boxes(image, cells):
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            image.show()

        # draw_text_boxes(page.image, cells)

        return page

    def assemble_doc(self, converted_doc: ConvertedDocument):
        all_elements = []
        all_headers = []
        all_body = []

        for p in converted_doc.pages:

            for el in p.assembled.body:
                all_body.append(el)
            for el in p.assembled.headers:
                all_headers.append(el)
            for el in p.assembled.elements:
                all_elements.append(el)

        converted_doc.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        converted_doc.output = self.glm_model(converted_doc)
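An end-to-end sketch of the converter defined above; "sample.pdf" and the output path are illustrative, and the first run downloads model artifacts from Hugging Face:

from pathlib import Path

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc_input = DocumentConversionInput.from_paths([Path("sample.pdf")])

for converted in converter.convert(doc_input):
    if converted.status == ConversionStatus.SUCCESS:
        Path("sample.md").write_text(converted.render_as_markdown())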
0
docling/models/__init__.py
Normal file
82
docling/models/ds_glm_model.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import copy
|
||||||
|
import random
|
||||||
|
|
||||||
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||||
|
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
|
||||||
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||||
|
from docling_core.types import BaseText
|
||||||
|
from docling_core.types import Document as DsDocument
|
||||||
|
from docling_core.types import Ref
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
||||||
|
from docling.datamodel.document import ConvertedDocument
|
||||||
|
|
||||||
|
|
||||||
|
class GlmModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
load_pretrained_nlp_models()
|
||||||
|
model = init_nlp_model(model_names="language;term;reference")
|
||||||
|
self.model = model
|
||||||
|
|
||||||
|
def __call__(self, document: ConvertedDocument) -> DsDocument:
|
||||||
|
ds_doc = document.to_ds_document()
|
||||||
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||||
|
|
||||||
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
|
ds_doc_dict = to_legacy_document_format(
|
||||||
|
glm_doc, ds_doc_dict, update_name_label=True
|
||||||
|
)
|
||||||
|
|
||||||
|
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
|
clusters_to_draw = []
|
||||||
|
image = copy.deepcopy(document.pages[page_no].image)
|
||||||
|
for ix, elem in enumerate(ds_document.main_text):
|
||||||
|
if isinstance(elem, BaseText):
|
||||||
|
prov = elem.prov[0]
|
||||||
|
elif isinstance(elem, Ref):
|
||||||
|
_, arr, index = elem.ref.split("/")
|
||||||
|
index = int(index)
|
||||||
|
if arr == "tables":
|
||||||
|
prov = ds_document.tables[index].prov[0]
|
||||||
|
elif arr == "figures":
|
||||||
|
prov = ds_document.figures[index].prov[0]
|
||||||
|
else:
|
||||||
|
prov = None
|
||||||
|
|
||||||
|
if prov and prov.page == page_no:
|
||||||
|
clusters_to_draw.append(
|
||||||
|
Cluster(
|
||||||
|
id=ix,
|
||||||
|
label=elem.name,
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=prov.bbox,
|
||||||
|
origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
).to_top_left_origin(document.pages[page_no].size.height),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in clusters_to_draw:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||||
|
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in c.cells: # [:1]:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells(ds_doc, 0)
|
||||||
|
# draw_clusters_and_cells(exported_doc, 0)
|
||||||
|
|
||||||
|
return exported_doc
|
77
docling/models/easyocr_model.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EasyOcrModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.enabled = config["enabled"]
|
||||||
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
|
if self.enabled:
|
||||||
|
import easyocr
|
||||||
|
|
||||||
|
self.reader = easyocr.Reader(config["lang"])
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
|
if not self.enabled:
|
||||||
|
yield from page_batch
|
||||||
|
return
|
||||||
|
|
||||||
|
for page in page_batch:
|
||||||
|
# rects = page._fpage.
|
||||||
|
high_res_image = page._backend.get_page_image(scale=self.scale)
|
||||||
|
im = numpy.array(high_res_image)
|
||||||
|
result = self.reader.readtext(im)
|
||||||
|
|
||||||
|
del high_res_image
|
||||||
|
del im
|
||||||
|
|
||||||
|
cells = [
|
||||||
|
OcrCell(
|
||||||
|
id=ix,
|
||||||
|
text=line[1],
|
||||||
|
confidence=line[2],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=(
|
||||||
|
line[0][0][0] / self.scale,
|
||||||
|
line[0][0][1] / self.scale,
|
||||||
|
line[0][2][0] / self.scale,
|
||||||
|
line[0][2][1] / self.scale,
|
||||||
|
),
|
||||||
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
for ix, line in enumerate(result)
|
||||||
|
]
|
||||||
|
|
||||||
|
page.cells = cells # For now, just overwrites all digital cells.
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = copy.deepcopy(page.image)
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in cells:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
yield page
|
318
docling/models/layout_model.py
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
BoundingBox,
|
||||||
|
Cell,
|
||||||
|
Cluster,
|
||||||
|
CoordOrigin,
|
||||||
|
LayoutPrediction,
|
||||||
|
Page,
|
||||||
|
)
|
||||||
|
from docling.utils import layout_utils as lu
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutModel:
|
||||||
|
|
||||||
|
TEXT_ELEM_LABELS = [
|
||||||
|
"Text",
|
||||||
|
"Footnote",
|
||||||
|
"Caption",
|
||||||
|
"Checkbox-Unselected",
|
||||||
|
"Checkbox-Selected",
|
||||||
|
"Section-header",
|
||||||
|
"Page-header",
|
||||||
|
"Page-footer",
|
||||||
|
"Code",
|
||||||
|
"List-item",
|
||||||
|
# "Formula",
|
||||||
|
]
|
||||||
|
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
||||||
|
|
||||||
|
TABLE_LABEL = "Table"
|
||||||
|
FIGURE_LABEL = "Picture"
|
||||||
|
FORMULA_LABEL = "Formula"
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.layout_predictor = LayoutPredictor(
|
||||||
|
config["artifacts_path"]
|
||||||
|
) # TODO temporary
|
||||||
|
|
||||||
|
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||||
|
MIN_INTERSECTION = 0.2
|
||||||
|
CLASS_THRESHOLDS = {
|
||||||
|
"Caption": 0.35,
|
||||||
|
"Footnote": 0.35,
|
||||||
|
"Formula": 0.35,
|
||||||
|
"List-item": 0.35,
|
||||||
|
"Page-footer": 0.35,
|
||||||
|
"Page-header": 0.35,
|
||||||
|
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||||
|
"Section-header": 0.45,
|
||||||
|
"Table": 0.35,
|
||||||
|
"Text": 0.45,
|
||||||
|
"Title": 0.45,
|
||||||
|
"Document Index": 0.45,
|
||||||
|
"Code": 0.45,
|
||||||
|
"Checkbox-Selected": 0.45,
|
||||||
|
"Checkbox-Unselected": 0.45,
|
||||||
|
"Form": 0.45,
|
||||||
|
"Key-Value Region": 0.45,
|
||||||
|
}
|
||||||
|
|
||||||
|
_log.debug("================= Start postprocess function ====================")
|
||||||
|
start_time = time.time()
|
||||||
|
# Apply Confidence Threshold to cluster predictions
|
||||||
|
# confidence = self.conf_threshold
|
||||||
|
clusters_out = []
|
||||||
|
|
||||||
|
for cluster in clusters:
|
||||||
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
||||||
|
if cluster.confidence >= confidence:
|
||||||
|
# annotation["created_by"] = "high_conf_pred"
|
||||||
|
clusters_out.append(cluster)
|
||||||
|
|
||||||
|
# map to dictionary clusters and cells, with bottom left origin
|
||||||
|
clusters = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"confidence": c.confidence,
|
||||||
|
"cell_ids": [],
|
||||||
|
"type": c.label,
|
||||||
|
}
|
||||||
|
for c in clusters
|
||||||
|
]
|
||||||
|
|
||||||
|
clusters_out = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"confidence": c.confidence,
|
||||||
|
"created_by": "high_conf_pred",
|
||||||
|
"cell_ids": [],
|
||||||
|
"type": c.label,
|
||||||
|
}
|
||||||
|
for c in clusters_out
|
||||||
|
]
|
||||||
|
|
||||||
|
raw_cells = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"text": c.text,
|
||||||
|
}
|
||||||
|
for c in cells
|
||||||
|
]
|
||||||
|
cell_count = len(raw_cells)
|
||||||
|
|
||||||
|
_log.debug("---- 0. Treat cluster overlaps ------")
|
||||||
|
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
|
||||||
|
|
||||||
|
_log.debug(
|
||||||
|
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
|
||||||
|
)
|
||||||
|
## Check for cells included in or touched by clusters:
|
||||||
|
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||||
|
clusters_out, raw_cells, MIN_INTERSECTION
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
|
||||||
|
# Creates a map of cell_id->cluster_id
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
# Assign orphan cells with lower confidence predictions
|
||||||
|
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
||||||
|
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
||||||
|
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||||
|
clusters_out, raw_cells, MIN_INTERSECTION
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 3. Settle Ambigous Cells")
|
||||||
|
# Creates an update map after assignment of cell_id->cluster_id
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
# Settle pdf cells that belong to multiple clusters
|
||||||
|
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
|
||||||
|
clusters_out, raw_cells, ambiguous_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 4. Set Orphans as Text")
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
||||||
|
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
||||||
|
# Merge cells orphan cells
|
||||||
|
clusters_out = lu.merge_cells(clusters_out)
|
||||||
|
|
||||||
|
# Clean up clusters that remain from merged and unreasonable clusters
|
||||||
|
clusters_out = lu.clean_up_clusters(
|
||||||
|
clusters_out,
|
||||||
|
raw_cells,
|
||||||
|
merge_cells=True,
|
||||||
|
img_table=True,
|
||||||
|
one_cell_table=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
|
||||||
|
clusters_out = new_clusters
|
||||||
|
|
||||||
|
## We first rebuild where every cell is now:
|
||||||
|
## Now we write into a prediction cells list, not into the raw cells list.
|
||||||
|
## As we don't need previous labels, we best overwrite any old list, because that might
|
||||||
|
## have been sorted differently.
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
target_cells = []
|
||||||
|
for ix, cell in enumerate(raw_cells):
|
||||||
|
new_cell = {
|
||||||
|
"id": ix,
|
||||||
|
"rawcell_id": ix,
|
||||||
|
"label": "None",
|
||||||
|
"bbox": cell["bbox"],
|
||||||
|
"text": cell["text"],
|
||||||
|
}
|
||||||
|
for cluster_index in clusters_around_cells[
|
||||||
|
ix
|
||||||
|
]: # By previous analysis, this is always 1 cluster.
|
||||||
|
new_cell["label"] = clusters_out[cluster_index]["type"]
|
||||||
|
target_cells.append(new_cell)
|
||||||
|
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
|
||||||
|
cells_out = target_cells
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
|
||||||
|
_log.debug("---- 5. Sort clusters in reading order ------")
|
||||||
|
sorted_clusters = lu.produce_reading_order(
|
||||||
|
clusters_out, "raw_cell_ids", "raw_cell_ids", True
|
||||||
|
)
|
||||||
|
clusters_out = sorted_clusters
|
||||||
|
|
||||||
|
# end_time = timer()
|
||||||
|
_log.debug("---- End of postprocessing function ------")
|
||||||
|
end_time = time.time() - start_time
|
||||||
|
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
||||||
|
|
||||||
|
cells_out = [
|
||||||
|
Cell(
|
||||||
|
id=c["id"],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_height),
|
||||||
|
text=c["text"],
|
||||||
|
)
|
||||||
|
for c in cells_out
|
||||||
|
]
|
||||||
|
clusters_out_new = []
|
||||||
|
for c in clusters_out:
|
||||||
|
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
|
||||||
|
c_new = Cluster(
|
||||||
|
id=c["id"],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_height),
|
||||||
|
confidence=c["confidence"],
|
||||||
|
label=c["type"],
|
||||||
|
cells=cluster_cells,
|
||||||
|
)
|
||||||
|
clusters_out_new.append(c_new)
|
||||||
|
|
||||||
|
return clusters_out_new, cells_out
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
for page in page_batch:
|
||||||
|
clusters = []
|
||||||
|
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
|
||||||
|
cluster = Cluster(
|
||||||
|
id=ix,
|
||||||
|
label=pred_item["label"],
|
||||||
|
confidence=pred_item["confidence"],
|
||||||
|
bbox=BoundingBox.model_validate(pred_item),
|
||||||
|
cells=[],
|
||||||
|
)
|
||||||
|
clusters.append(cluster)
|
||||||
|
|
||||||
|
# Map cells to clusters
|
||||||
|
# TODO: Remove, postprocess should take care of it anyway.
|
||||||
|
for cell in page.cells:
|
||||||
|
for cluster in clusters:
|
||||||
|
if not cell.bbox.area() > 0:
|
||||||
|
overlap_frac = 0.0
|
||||||
|
else:
|
||||||
|
overlap_frac = (
|
||||||
|
cell.bbox.intersection_area_with(cluster.bbox)
|
||||||
|
/ cell.bbox.area()
|
||||||
|
)
|
||||||
|
|
||||||
|
if overlap_frac > 0.5:
|
||||||
|
cluster.cells.append(cell)
|
||||||
|
|
||||||
|
# Pre-sort clusters
|
||||||
|
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = copy.deepcopy(page.image)
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in clusters:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in c.cells: # [:1]:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
clusters, page.cells = self.postprocess(
|
||||||
|
clusters, page.cells, page.size.height
|
||||||
|
)
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||||
|
|
||||||
|
yield page
|
160
docling/models/page_assemble_model.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
AssembledUnit,
|
||||||
|
FigureElement,
|
||||||
|
Page,
|
||||||
|
PageElement,
|
||||||
|
TableElement,
|
||||||
|
TextElement,
|
||||||
|
)
|
||||||
|
from docling.models.layout_model import LayoutModel
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PageAssembleModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
|
||||||
|
|
||||||
|
# def sanitize_text_poor(self, lines):
|
||||||
|
# text = '\n'.join(lines)
|
||||||
|
#
|
||||||
|
# # treat line wraps.
|
||||||
|
# sanitized_text = self.line_wrap_pattern.sub('', text)
|
||||||
|
#
|
||||||
|
# sanitized_text = sanitized_text.replace('\n', ' ')
|
||||||
|
#
|
||||||
|
# return sanitized_text
|
||||||
|
|
||||||
|
def sanitize_text(self, lines):
|
||||||
|
if len(lines) <= 1:
|
||||||
|
return " ".join(lines)
|
||||||
|
|
||||||
|
for ix, line in enumerate(lines[1:]):
|
||||||
|
prev_line = lines[ix]
|
||||||
|
|
||||||
|
if prev_line.endswith("-"):
|
||||||
|
prev_words = re.findall(r"\b[\w]+\b", prev_line)
|
||||||
|
line_words = re.findall(r"\b[\w]+\b", line)
|
||||||
|
|
||||||
|
if (
|
||||||
|
len(prev_words)
|
||||||
|
and len(line_words)
|
||||||
|
and prev_words[-1].isalnum()
|
||||||
|
and line_words[0].isalnum()
|
||||||
|
):
|
||||||
|
lines[ix] = prev_line[:-1]
|
||||||
|
else:
|
||||||
|
lines[ix] += " "
|
||||||
|
|
||||||
|
sanitized_text = "".join(lines)
|
||||||
|
|
||||||
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
for page in page_batch:
|
||||||
|
# assembles some JSON output page by page.
|
||||||
|
|
||||||
|
elements: List[PageElement] = []
|
||||||
|
headers: List[PageElement] = []
|
||||||
|
body: List[PageElement] = []
|
||||||
|
|
||||||
|
for cluster in page.predictions.layout.clusters:
|
||||||
|
# _log.info("Cluster label seen:", cluster.label)
|
||||||
|
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||||
|
|
||||||
|
textlines = [
|
||||||
|
cell.text.replace("\x02", "-").strip()
|
||||||
|
for cell in cluster.cells
|
||||||
|
if len(cell.text.strip()) > 0
|
||||||
|
]
|
||||||
|
text = self.sanitize_text(textlines)
|
||||||
|
text_el = TextElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text=text,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=cluster,
|
||||||
|
)
|
||||||
|
elements.append(text_el)
|
||||||
|
|
||||||
|
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||||
|
headers.append(text_el)
|
||||||
|
else:
|
||||||
|
body.append(text_el)
|
||||||
|
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||||
|
tbl = None
|
||||||
|
if page.predictions.tablestructure:
|
||||||
|
tbl = page.predictions.tablestructure.table_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not tbl
|
||||||
|
): # fallback: add table without structure, if it isn't present
|
||||||
|
tbl = TableElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
otsl_seq=[],
|
||||||
|
table_cells=[],
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
|
||||||
|
elements.append(tbl)
|
||||||
|
body.append(tbl)
|
||||||
|
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||||
|
fig = None
|
||||||
|
if page.predictions.figures_classification:
|
||||||
|
fig = page.predictions.figures_classification.figure_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not fig
|
||||||
|
): # fallback: add figure without classification, if it isn't present
|
||||||
|
fig = FigureElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
data=None,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
elements.append(fig)
|
||||||
|
body.append(fig)
|
||||||
|
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||||
|
equation = None
|
||||||
|
if page.predictions.equations_prediction:
|
||||||
|
equation = (
|
||||||
|
page.predictions.equations_prediction.equation_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not equation: # fallback: add empty formula, if it isn't present
|
||||||
|
text = self.sanitize_text(
|
||||||
|
[
|
||||||
|
cell.text.replace("\x02", "-").strip()
|
||||||
|
for cell in cluster.cells
|
||||||
|
if len(cell.text.strip()) > 0
|
||||||
|
]
|
||||||
|
)
|
||||||
|
equation = TextElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
elements.append(equation)
|
||||||
|
body.append(equation)
|
||||||
|
|
||||||
|
page.assembled = AssembledUnit(
|
||||||
|
elements=elements, headers=headers, body=body
|
||||||
|
)
|
||||||
|
|
||||||
|
yield page
|
114
docling/models/table_structure_model.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
BoundingBox,
|
||||||
|
Page,
|
||||||
|
TableCell,
|
||||||
|
TableElement,
|
||||||
|
TableStructurePrediction,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TableStructureModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.do_cell_matching = config["do_cell_matching"]
|
||||||
|
|
||||||
|
self.enabled = config["enabled"]
|
||||||
|
if self.enabled:
|
||||||
|
artifacts_path = config["artifacts_path"]
|
||||||
|
# Third Party
|
||||||
|
import docling_ibm_models.tableformer.common as c
|
||||||
|
|
||||||
|
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
||||||
|
self.tm_config["model"]["save_dir"] = artifacts_path
|
||||||
|
self.tm_model_type = self.tm_config["model"]["type"]
|
||||||
|
|
||||||
|
self.tf_predictor = TFPredictor(self.tm_config)
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
|
if not self.enabled:
|
||||||
|
yield from page_batch
|
||||||
|
return
|
||||||
|
|
||||||
|
for page in page_batch:
|
||||||
|
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||||
|
|
||||||
|
in_tables = [
|
||||||
|
(
|
||||||
|
cluster,
|
||||||
|
[
|
||||||
|
round(cluster.bbox.l),
|
||||||
|
round(cluster.bbox.t),
|
||||||
|
round(cluster.bbox.r),
|
||||||
|
round(cluster.bbox.b),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
for cluster in page.predictions.layout.clusters
|
||||||
|
if cluster.label == "Table"
|
||||||
|
]
|
||||||
|
if not len(in_tables):
|
||||||
|
yield page
|
||||||
|
continue
|
||||||
|
|
||||||
|
tokens = []
|
||||||
|
for c in page.cells:
|
||||||
|
for cluster, _ in in_tables:
|
||||||
|
if c.bbox.area() > 0:
|
||||||
|
if (
|
||||||
|
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
|
||||||
|
> 0.2
|
||||||
|
):
|
||||||
|
# Only allow non empty stings (spaces) into the cells of a table
|
||||||
|
if len(c.text.strip()) > 0:
|
||||||
|
tokens.append(c.model_dump())
|
||||||
|
|
||||||
|
iocr_page = {
|
||||||
|
"image": numpy.asarray(page.image),
|
||||||
|
"tokens": tokens,
|
||||||
|
"width": page.size.width,
|
||||||
|
"height": page.size.height,
|
||||||
|
}
|
||||||
|
|
||||||
|
table_clusters, table_bboxes = zip(*in_tables)
|
||||||
|
|
||||||
|
if len(table_bboxes):
|
||||||
|
tf_output = self.tf_predictor.multi_table_predict(
|
||||||
|
iocr_page, table_bboxes, do_matching=self.do_cell_matching
|
||||||
|
)
|
||||||
|
|
||||||
|
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||||
|
table_cells = []
|
||||||
|
for element in table_out["tf_responses"]:
|
||||||
|
|
||||||
|
if not self.do_cell_matching:
|
||||||
|
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||||
|
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||||
|
element["bbox"]["token"] = text_piece
|
||||||
|
|
||||||
|
tc = TableCell.model_validate(element)
|
||||||
|
table_cells.append(tc)
|
||||||
|
|
||||||
|
# Retrieving cols/rows, after post processing:
|
||||||
|
num_rows = table_out["predict_details"]["num_rows"]
|
||||||
|
num_cols = table_out["predict_details"]["num_cols"]
|
||||||
|
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||||
|
|
||||||
|
tbl = TableElement(
|
||||||
|
otsl_seq=otsl_seq,
|
||||||
|
table_cells=table_cells,
|
||||||
|
num_rows=num_rows,
|
||||||
|
num_cols=num_cols,
|
||||||
|
id=table_cluster.id,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=table_cluster,
|
||||||
|
label="Table",
|
||||||
|
)
|
||||||
|
|
||||||
|
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
|
||||||
|
|
||||||
|
yield page
|
0
docling/pipeline/__init__.py
Normal file
18
docling/pipeline/base_model_pipeline.py
Normal file
@@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions


class BaseModelPipeline:
    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        self.model_pipe = []
        self.artifacts_path = artifacts_path
        self.pipeline_options = pipeline_options

    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch
40
docling/pipeline/standard_model_pipeline.py
Normal file
@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline


class StandardModelPipeline(BaseModelPipeline):
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

        self.model_pipe = [
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": False,
                }
            ),
        ]
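A short sketch of how the pipeline options consumed above are expected to be passed through DocumentConverter; the PipelineOptions field names come from this commit, but constructing it with keyword arguments is an assumption:

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter

# do_ocr feeds the EasyOcrModel config and do_table_structure feeds the
# TableStructureModel config built in StandardModelPipeline above.
options = PipelineOptions(do_ocr=True, do_table_structure=True)
converter = DocumentConverter(pipeline_options=options)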
0
docling/utils/__init__.py
Normal file
806
docling/utils/layout_utils.py
Normal file
@@ -0,0 +1,806 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
|
logger = logging.getLogger("layout_utils")
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Geometric helper functions
|
||||||
|
## The coordinates grow left to right, and bottom to top.
|
||||||
|
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
|
||||||
|
|
||||||
|
|
||||||
|
def area(bbox):
|
||||||
|
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
||||||
|
|
||||||
|
|
||||||
|
def contains(bbox_i, bbox_j):
|
||||||
|
## Returns True if bbox_i contains bbox_j, else False
|
||||||
|
return (
|
||||||
|
bbox_i[0] <= bbox_j[0]
|
||||||
|
and bbox_i[1] <= bbox_j[1]
|
||||||
|
and bbox_i[2] >= bbox_j[2]
|
||||||
|
and bbox_i[3] >= bbox_j[3]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_intersecting(bbox_i, bbox_j):
|
||||||
|
return not (
|
||||||
|
bbox_i[2] < bbox_j[0]
|
||||||
|
or bbox_i[0] > bbox_j[2]
|
||||||
|
or bbox_i[3] < bbox_j[1]
|
||||||
|
or bbox_i[1] > bbox_j[3]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def bb_iou(boxA, boxB):
|
||||||
|
# determine the (x, y)-coordinates of the intersection rectangle
|
||||||
|
xA = max(boxA[0], boxB[0])
|
||||||
|
yA = max(boxA[1], boxB[1])
|
||||||
|
xB = min(boxA[2], boxB[2])
|
||||||
|
yB = min(boxA[3], boxB[3])
|
||||||
|
# compute the area of intersection rectangle
|
||||||
|
interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
|
||||||
|
# compute the area of both the prediction and ground-truth
|
||||||
|
# rectangles
|
||||||
|
boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
|
||||||
|
boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
|
||||||
|
# compute the intersection over union by taking the intersection
|
||||||
|
# area and dividing it by the sum of prediction + ground-truth
|
||||||
|
# areas - the interesection area
|
||||||
|
iou = interArea / float(boxAArea + boxBArea - interArea)
|
||||||
|
# return the intersection over union value
|
||||||
|
return iou
|
||||||
|
|
||||||
|
|
||||||
|
def compute_intersection(bbox_i, bbox_j):
|
||||||
|
## Returns the size of the intersection area of the two boxes
|
||||||
|
if not is_intersecting(bbox_i, bbox_j):
|
||||||
|
return 0
|
||||||
|
## Determine the (x, y)-coordinates of the intersection rectangle:
|
||||||
|
xA = max(bbox_i[0], bbox_j[0])
|
||||||
|
yA = max(bbox_i[1], bbox_j[1])
|
||||||
|
xB = min(bbox_i[2], bbox_j[2])
|
||||||
|
yB = min(bbox_i[3], bbox_j[3])
|
||||||
|
## Compute the area of intersection rectangle:
|
||||||
|
interArea = (xB - xA) * (yB - yA)
|
||||||
|
if interArea < 0:
|
||||||
|
logger.debug("Warning: Negative intersection detected!")
|
||||||
|
return 0
|
||||||
|
return interArea
|
||||||
|
|
||||||
|
|
||||||
|
def surrounding(bbox_i, bbox_j):
|
||||||
|
## Computes minimal box that contains both input boxes
|
||||||
|
sbox = []
|
||||||
|
sbox.append(min(bbox_i[0], bbox_j[0]))
|
||||||
|
sbox.append(min(bbox_i[1], bbox_j[1]))
|
||||||
|
sbox.append(max(bbox_i[2], bbox_j[2]))
|
||||||
|
sbox.append(max(bbox_i[3], bbox_j[3]))
|
||||||
|
return sbox
|
||||||
|
|
||||||
|
|
||||||
|
def surrounding_list(bbox_list):
|
||||||
|
## Computes minimal box that contains all boxes in the input list
|
||||||
|
## The list should be non-empty, but just in case it's not:
|
||||||
|
if len(bbox_list) == 0:
|
||||||
|
sbox = [0, 0, 0, 0]
|
||||||
|
else:
|
||||||
|
sbox = []
|
||||||
|
sbox.append(min([bbox[0] for bbox in bbox_list]))
|
||||||
|
sbox.append(min([bbox[1] for bbox in bbox_list]))
|
||||||
|
sbox.append(max([bbox[2] for bbox in bbox_list]))
|
||||||
|
sbox.append(max([bbox[3] for bbox in bbox_list]))
|
||||||
|
return sbox
|
||||||
|
|
||||||
|
|
||||||
|
def vertical_overlap(bboxA, bboxB):
|
||||||
|
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||||
|
if bboxB[3] < bboxA[1]: ## B below A
|
||||||
|
return False
|
||||||
|
elif bboxA[3] < bboxB[1]: ## A below B
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def vertical_overlap_fraction(bboxA, bboxB):
|
||||||
|
## Returns the vertical overlap as fraction of the lower bbox height.
|
||||||
|
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||||
|
## Height 0 is permitted in the input.
|
||||||
|
heightA = bboxA[3] - bboxA[1]
|
||||||
|
heightB = bboxB[3] - bboxB[1]
|
||||||
|
min_height = min(heightA, heightB)
|
||||||
|
if bboxA[3] >= bboxB[3]: ## A starts higher or equal
|
||||||
|
if (
|
||||||
|
bboxA[1] <= bboxB[1]
|
||||||
|
): ## B is completely in A; this can include height of B = 0:
|
||||||
|
fraction = 1
|
||||||
|
else:
|
||||||
|
overlap = max(bboxB[3] - bboxA[1], 0)
|
||||||
|
fraction = overlap / max(min_height, 0.001)
|
||||||
|
else:
|
||||||
|
if (
|
||||||
|
bboxB[1] <= bboxA[1]
|
||||||
|
): ## A is completely in B; this can include height of A = 0:
|
||||||
|
fraction = 1
|
||||||
|
else:
|
||||||
|
overlap = max(bboxA[3] - bboxB[1], 0)
|
||||||
|
fraction = overlap / max(min_height, 0.001)
|
||||||
|
return fraction
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Cluster-and-cell relations
|
||||||
|
|
||||||
|
|
||||||
|
def compute_enclosed_cells(
|
||||||
|
cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
|
||||||
|
):
|
||||||
|
cells_in_cluster = []
|
||||||
|
cells_in_cluster_int = []
|
||||||
|
for ix, cell in enumerate(raw_cells):
|
||||||
|
cell_bbox = cell["bbox"]
|
||||||
|
intersection = compute_intersection(cell_bbox, cluster_bbox)
|
||||||
|
frac_area = area(cell_bbox) * min_cell_intersection_with_cluster
|
||||||
|
|
||||||
|
if (
|
||||||
|
intersection > frac_area and frac_area > 0
|
||||||
|
): # intersect > certain fraction of cell
|
||||||
|
cells_in_cluster.append(ix)
|
||||||
|
cells_in_cluster_int.append(intersection)
|
||||||
|
elif contains(
|
||||||
|
cluster_bbox,
|
||||||
|
[cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
|
||||||
|
):
|
||||||
|
cells_in_cluster.append(ix)
|
||||||
|
return cells_in_cluster, cells_in_cluster_int
|
||||||
|
|
||||||
|
|
||||||
|
def find_clusters_around_cells(cell_count, clusters):
|
||||||
|
## Per raw cell, find to which clusters it belongs.
|
||||||
|
## Return list of these indices in the raw-cell order.
|
||||||
|
clusters_around_cells = [[] for _ in range(cell_count)]
|
||||||
|
for cl_ix, cluster in enumerate(clusters):
|
||||||
|
for ix in cluster["cell_ids"]:
|
||||||
|
clusters_around_cells[ix].append(cl_ix)
|
||||||
|
return clusters_around_cells
|
||||||
|
|
||||||
|
|
||||||
|
def find_cell_index(raw_ix, cell_array):
|
||||||
|
## "raw_ix" is a rawcell_id.
|
||||||
|
## "cell_array" has the structure of an (annotation) cells array.
|
||||||
|
## Returns index of cell in cell_array that has this rawcell_id.
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
return ix
|
||||||
|
|
||||||
|
|
||||||
|
def find_cell_indices(cluster, cell_array):
|
||||||
|
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||||
|
## "cell_array" that of a cells array.
|
||||||
|
## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
|
||||||
|
## in the order of the rawcell_ids.
|
||||||
|
result = []
|
||||||
|
for raw_ix in sorted(cluster["cell_ids"]):
|
||||||
|
## Find the cell with this rawcell_id (if any)
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
result.append(ix)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def find_first_cell_index(cluster, cell_array):
|
||||||
|
## "cluster" must be a dict with key "cell_ids"; it can also be a line.
|
||||||
|
## "cell_array" has the structure of a cells array in an annotation.
|
||||||
|
## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
|
||||||
|
result = [] ## We keep it a list as it can be empty (picture without text cells)
|
||||||
|
if len(cluster["cell_ids"]) == 0:
|
||||||
|
return result
|
||||||
|
raw_ix = min(cluster["cell_ids"])
|
||||||
|
## Find the cell with this rawcell_id (if any)
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
result.append(ix)
|
||||||
|
break ## One is enough; should be only one anyway.
|
||||||
|
if result == []:
|
||||||
|
logger.debug(
|
||||||
|
" Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Cluster labels and text
|
||||||
|
|
||||||
|
|
||||||
|
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
|
||||||
|
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||||
|
## "cl_ix" is its index in target_pred,
|
||||||
|
## "new_label" is the intended new label,
|
||||||
|
## "target_pred" is the entire current target prediction.
|
||||||
|
## Sets label on the cluster itself, and on the cells in the target_pred.
|
||||||
|
## Returns new_label so that also the cl_label variable in the main code is easily set.
|
||||||
|
target_pred["clusters"][cl_ix]["type"] = new_label
|
||||||
|
cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
|
||||||
|
for ix in cluster_target_cells:
|
||||||
|
target_pred["cells"][ix]["label"] = new_label
|
||||||
|
return new_label
|
||||||
|
|
||||||
|
|
||||||
|
def find_cluster_text(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"].
    ## Returns the text of the cluster, with blanks between the cell contents
    ## (which seem to be words or phrases without leading or trailing blanks).
    ## Note that in formulas, this may give a lot more blanks than originally present.
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
    return cluster_text.rstrip()


def find_cluster_text_without_blanks(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"].
    ## Returns the text of the cluster, without blanks between the cell contents.
    ## Interesting in formula analysis.
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"]
    return cluster_text.rstrip()

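## Example (illustrative, hypothetical cells) of the two text-joining helpers above:
##   raw = [{"text": "Hello"}, {"text": "world"}]
##   find_cluster_text({"cell_ids": [1, 0]}, raw)                 ->  "Hello world"
##   find_cluster_text_without_blanks({"cell_ids": [1, 0]}, raw)  ->  "Helloworld"
## The cell_ids are visited in sorted order, not in the order given.

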
## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)


def build_cluster_from_lines(lines, label, id):
    ## "lines" must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
    ## (There is no condition that they are really geometrically lines)
    ## A cluster in standard format is returned with given label and id
    local_lines = copy.deepcopy(
        lines
    )  ## without this, it changes "lines" also outside this function
    first_line = local_lines.pop(0)
    cluster = {
        "id": id,
        "type": label,
        "cell_ids": first_line["cell_ids"],
        "bbox": first_line["bbox"],
        "confidence": 0,
        "created_by": "merged_cells",
    }
    confidence = 0
    counter = 0
    for line in local_lines:
        new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
        cluster["cell_ids"] = new_cell_ids
        cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
        counter += 1
        confidence += line["confidence"]
    confidence = confidence / counter
    cluster["confidence"] = confidence
    return cluster

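## Example (illustrative, hypothetical lines; assumes surrounding() returns the enclosing box):
##   lines = [
##       {"cell_ids": [0, 1], "bbox": [0, 0, 10, 10], "confidence": 0.8},
##       {"cell_ids": [2], "bbox": [0, 12, 10, 20], "confidence": 0.6},
##   ]
##   build_cluster_from_lines(lines, "Text", 5)
##   ->  {"id": 5, "type": "Text", "cell_ids": [0, 1, 2], "bbox": [0, 0, 10, 20],
##        "confidence": 0.6, "created_by": "merged_cells"}
## Note that the returned confidence is the average over the lines after the first one.

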
## -------------------------------
## Reading order


def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    ## In:
    ##   clusters: list as in predictions.
    ##   cluster_sort_type: string, currently only "raw_cell_ids".
    ##   cell_sort_type: string, currently only "raw_cell_ids".
    ##   sort_ids: Boolean, whether the cluster ids should be adapted to their new position.
    ## Out: Another clusters list, sorted according to the type.

    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            sorted_cell_ids = sorted(cl["cell_ids"])
            cl["cell_ids"] = sorted_cell_ids
    else:
        logger.debug(
            "Unknown cell_sort_type `"
            + cell_sort_type
            + "`, no cell sorting will happen."
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `"
            + cluster_sort_type
            + "`, no cluster sorting will happen."
        )
        sorted_clusters = clusters  ## keep the input order so the return below is defined

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters

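## Example (illustrative, hypothetical clusters):
##   clusters = [
##       {"id": 7, "cell_ids": [5, 3]},
##       {"id": 2, "cell_ids": [0]},
##       {"id": 9, "cell_ids": []},
##   ]
##   produce_reading_order(clusters, "raw_cell_ids", "raw_cell_ids", sort_ids=True)
##   returns the cluster starting at raw cell 0 first (new id 0), then the one starting
##   at raw cell 3 (new id 1), and appends the empty cluster last (new id 2).

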
## -------------------------------
## Line Splitting


def sort_cells_horizontal(line_cell_ids, raw_cells):
    ## "line_cell_ids" should be a non-empty list of (raw) cell_ids.
    ## "raw_cells" has the structure of item["raw"]["cells"].
    ## Sorts the cells in the line by x0 (left start).
    new_line_cell_ids = sorted(
        line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
    )
    return new_line_cell_ids

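## Example (illustrative, hypothetical cells with bbox = [x0, y0, x1, y1]):
##   raw = [{"bbox": [50, 0, 60, 10]}, {"bbox": [10, 0, 20, 10]}]
##   sort_cells_horizontal([0, 1], raw)  ->  [1, 0]
## i.e. cell 1 comes first because it starts further to the left.

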
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    new_clusters = []
    for ix, cluster in enumerate(clusters):
        new_cluster = copy.deepcopy(cluster)
        logger.debug(
            "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
        )
        logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
            logger.debug(" Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.
        new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
        new_cluster["bbox"] = new_bbox
        new_clusters.append(new_cluster)
    return new_clusters


def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    if not (cluster["type"] in ["Table", "Picture"]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" New bounding box:" + str(new_bbox))
    if cluster["type"] == "Picture":
        ## We only make the bbox completely comprise included text cells:
        logger.debug(" Picture")
        if len(cluster["cell_ids"]) != 0:
            min_bbox = surrounding_list(
                [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
            )
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else:  ## A table
        ## At least we have to keep the included text cells, and we make the bbox completely comprise them
        min_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        ## Now we look which non-belonging cells are covered.
        ## (To decrease dependencies, we don't make use of which cells we actually removed.)
        ## We don't worry about orphan cells, those could still be added to the table.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )
        if len(spurious_cells) == 0:
            return new_bbox

        ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
        ## We initialize possible cuts with the current bbox.
        left_cut = new_bbox[0]
        right_cut = new_bbox[2]
        upper_cut = new_bbox[3]
        lower_cut = new_bbox[1]

        for cell_ix in spurious_cells:
            cell = raw_cells[cell_ix]
            # logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
            is_left = cell["bbox"][2] < min_bbox[0]
            is_right = cell["bbox"][0] > min_bbox[2]
            is_above = cell["bbox"][1] > min_bbox[3]
            is_below = cell["bbox"][3] < min_bbox[1]
            # logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))

            if is_left:
                if cell["bbox"][2] > left_cut:
                    ## We move the left cut to exclude this cell:
                    left_cut = cell["bbox"][2]
            if is_right:
                if cell["bbox"][0] < right_cut:
                    ## We move the right cut to exclude this cell:
                    right_cut = cell["bbox"][0]
            if is_above:
                if cell["bbox"][1] < upper_cut:
                    ## We move the upper cut to exclude this cell:
                    upper_cut = cell["bbox"][1]
            if is_below:
                if cell["bbox"][3] > lower_cut:
                    ## We move the lower cut to exclude this cell:
                    lower_cut = cell["bbox"][3]
            # logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))

        new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

        logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox


def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    DuplicateDeletedClusterIDs = []
    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                if_conf = False
                if cluster_1["confidence"] > cluster_2["confidence"]:
                    if_conf = True
                if if_conf == True:
                    if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                    elif contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions

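## Example (illustrative, hypothetical clusters; assumes bb_iou() is the usual
## intersection-over-union): the lower-confidence of two heavily overlapping
## clusters is dropped.
##   preds = [
##       {"id": 0, "confidence": 0.9, "bbox": [0, 0, 100, 50], "type": "Text", "cell_ids": []},
##       {"id": 1, "confidence": 0.4, "bbox": [2, 2, 98, 48], "type": "Text", "cell_ids": []},
##   ]
##   remove_cluster_duplicates_by_conf(preds, threshold=0.5)  ->  only cluster 0 remains

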
# Assign orphan cells to a low-confidence prediction (one below the confidence used for
# the assigned clusters), if it overlaps them well enough
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    for orph_id in orphan_cell_indices:
        cluster_chosen = {}
        iou_thresh = 0.05
        confidence = 0.05

        # Loop over all predictions, and find the one with the highest IOU, and confidence
        for cluster in cluster_predictions_low:
            calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
            cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
                cluster["bbox"][2] - cluster["bbox"][0]
            )
            cell_area = (
                raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
            ) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])

            if (
                (iou_thresh < calc_iou)
                and (cluster["confidence"] > confidence)
                and (cell_area * 3 > cluster_area)
            ):
                cluster_chosen = cluster
                iou_thresh = calc_iou
                confidence = cluster["confidence"]
        # If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
        if iou_thresh != 0.05 and confidence != 0.05:
            cluster_chosen["cell_ids"].append(orph_id)
            cluster_chosen["created_by"] = "orph_low_conf"
            cluster_predictions.append(cluster_chosen)
            orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices


def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    for amb_cell_id in amb_cell_idxs:
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        problamatic_clusters = []

        # Find clusters in question
        for cluster in cluster_predictions:

            if amb_cell_id in cluster["cell_ids"]:
                problamatic_clusters.append(amb_cell_id)

                # If the cell_id is in a cluster of high conf, and highest iou score, and smaller in area
                bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])

                if (
                    cluster["confidence"] > highest_conf
                    and bbox_iou_val > highest_bbox_iou
                ):
                    cluster_chosen = cluster
                    highest_conf = cluster["confidence"]
                    highest_bbox_iou = bbox_iou_val
                    if cluster["id"] in problamatic_clusters:
                        problamatic_clusters.remove(cluster["id"])

        # now remove the assigning of cell id from lower confidence, and threshold
        for cluster in cluster_predictions:
            for prob_amb_id in problamatic_clusters:
                if prob_amb_id in cluster["cell_ids"]:
                    cluster["cell_ids"].remove(prob_amb_id)
        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs


def ranges(nums):
    # Find if consecutive numbers exist within pdf cells
    # Used to remove line numbers for review manuscripts
    nums = sorted(set(nums))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))

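# Example (illustrative, hypothetical values): consecutive numbers collapse into
# (start, end) runs, isolated numbers become single-element runs.
#   ranges([1, 2, 3, 7, 8, 12])  ->  [(1, 3), (7, 8), (12, 12)]

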
def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
        if cluster["type"] == "Picture":
            figures.append(cluster)

        if cluster["id"] > max_id:
            max_id = cluster["id"]
    max_id += 1

    lines_detector = False
    content_of_orphans = []
    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        content_of_orphans.append(raw_cells[orph_id]["text"])

    fil_content_of_orphans = []
    for cell_content in content_of_orphans:
        if cell_content.isnumeric():
            try:
                num = int(cell_content)
                fil_content_of_orphans.append(num)
            except ValueError:  # ignore the cell
                pass

    # line_orphans = []
    # If there are more than 2 numeric orphan cells, check (using the ranges function)
    # whether they form long consecutive series of numbers; such series are most likely
    # the line numbers of a review manuscript and should be ignored.

    if len(fil_content_of_orphans) > 2:
        out_ranges = ranges(fil_content_of_orphans)
        if len(out_ranges) > 1:
            cnt_range = 0
            for ranges_ in out_ranges:
                if ranges_[0] != ranges_[1]:
                    # If a run covers more than 75 numbers (half the total line number of
                    # a review manuscript page), decide that there are line numbers on the
                    # page to be ignored.
                    if len(list(range(ranges_[0], ranges_[1]))) > 75:
                        lines_detector = True
                        # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
            fig_flag = False
            # Do not assign orphan cells if they are inside a figure
            for fig in figures:
                if contains(fig["bbox"], orph_cell["bbox"]):
                    fig_flag = True

            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
                class_type = "Text"

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
                    class_type = "Text"
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
                    ):
                        class_type = cluster["type"]
                    elif contains(
                        cluster["bbox"],
                        [
                            orph_cell["bbox"][0] + 3,
                            orph_cell["bbox"][1] + 3,
                            orph_cell["bbox"][2] - 3,
                            orph_cell["bbox"][3] - 3,
                        ],
                    ):
                        class_type = cluster["type"]
                    elif intersection > area(orph_cell["bbox"]) * 0.2:
                        class_type = cluster["type"]

                new_cluster = {
                    "id": max_id,
                    "bbox": orph_cell["bbox"],
                    "type": class_type,
                    "cell_ids": [orph_id],
                    "confidence": -1,
                    "created_by": "orphan_default",
                }
                max_id += 1
                cluster_predictions.append(new_cluster)
    return cluster_predictions, orphan_cell_indices


def merge_cells(cluster_predictions):
    # Build a graph over the "orphan_default" clusters and merge those whose bboxes touch
    # or are very close (within 2 units) into one cluster per connected component.
    G = nx.Graph()
    for cluster in cluster_predictions:
        if cluster["created_by"] == "orphan_default":
            G.add_node(cluster["id"])

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if (
                cluster_1["id"] != cluster_2["id"]
                and cluster_2["created_by"] == "orphan_default"
                and cluster_1["created_by"] == "orphan_default"
            ):
                cl1 = copy.deepcopy(cluster_1["bbox"])
                cl2 = copy.deepcopy(cluster_2["bbox"])
                cl1[0] = cl1[0] - 2
                cl1[1] = cl1[1] - 2
                cl1[2] = cl1[2] + 2
                cl1[3] = cl1[3] + 2
                cl2[0] = cl2[0] - 2
                cl2[1] = cl2[1] - 2
                cl2[2] = cl2[2] + 2
                cl2[3] = cl2[3] + 2
                if is_intersecting(cl1, cl2):
                    G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
    max_id = -1
    for cluster_1 in cluster_predictions:
        if cluster_1["id"] > max_id:
            max_id = cluster_1["id"]

    for nodes in component:
        if len(nodes) > 1:
            max_id += 1
            lines = []
            for node in nodes:
                for cluster in cluster_predictions:
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions


def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    DuplicateDeletedClusterIDs = []

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                # remove any artifacts created by merging clusters
                if merge_cells == True:
                    if contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        cluster_1["cell_ids"] = (
                            cluster_1["cell_ids"] + cluster_2["cell_ids"]
                        )
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
                        cluster_1["type"] == "Text"
                        and cluster_2["type"] == "Picture"
                        or cluster_2["type"] == "Table"
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                        elif contains(
                            [
                                cluster_2["bbox"][0] - 3,
                                cluster_2["bbox"][1] - 3,
                                cluster_2["bbox"][2] + 3,
                                cluster_2["bbox"][3] + 3,
                            ],
                            cluster_1["bbox"],
                        ):
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                # remove tables that have one pdf cell
                if one_cell_table == True:
                    if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
                        DuplicateDeletedClusterIDs.append(cluster_1["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions


def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    for cluster in clusters:
        cells_in_cluster, _ = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        cluster["cell_ids"] = cells_in_cluster
    ## These cell_ids are ids of the raw cells.
    ## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
    return clusters


# Creates a map of cell_id -> list of cluster indices, plus the lists of orphan and
# ambiguous cells
def cell_id_state_map(clusters, cell_count):
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
    orphan_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
    ]  # which cells are assigned no cluster?
    ambiguous_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
    ]  # which cells are assigned > 1 clusters?
    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
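## Example (illustrative, hypothetical clusters):
##   clusters = [{"cell_ids": [0, 1]}, {"cell_ids": [1]}]
##   cell_id_state_map(clusters, cell_count=3)
##   ->  ([[0], [0, 1], []], [2], [1])
## i.e. cell 2 is an orphan (no cluster claims it) and cell 1 is ambiguous
## (claimed by both clusters 0 and 1).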
41
docling/utils/utils.py
Normal file
@ -0,0 +1,41 @@
import hashlib
from io import BytesIO
from itertools import islice
from pathlib import Path
from typing import List, Union


def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable."""
    if isinstance(iterator, List):
        iterator = iter(iterator)
    for first in iterator:  # Take the first element from the iterator
        yield [first] + list(islice(iterator, chunk_size - 1))

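# Example (illustrative): chunkify works on lists and on plain iterators alike.
#   list(chunkify([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]
#   list(chunkify(range(5), 2))         ->  [[0, 1], [2, 3], [4]]

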
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Create a stable page_hash of the path_or_stream of a file"""

    block_size = 65536
    hasher = hashlib.sha256()

    def _hash_buf(binary_stream):
        buf = binary_stream.read(block_size)  # read and page_hash in chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = binary_stream.read(block_size)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as afile:
            _hash_buf(afile)
    elif isinstance(path_or_stream, BytesIO):
        _hash_buf(path_or_stream)

    return hasher.hexdigest()

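# Example (illustrative; the digest value depends on the file contents):
#   create_file_hash(Path("./test/data/2206.01062.pdf"))  ->  64-character hex SHA-256 string
#   create_file_hash(BytesIO(b"hello"))  ->  same value as hashlib.sha256(b"hello").hexdigest()

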
def create_hash(string: str):
    hasher = hashlib.sha256()
    hasher.update(string.encode("utf-8"))

    return hasher.hexdigest()
73
examples/convert.py
Normal file
@ -0,0 +1,73 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    converted_docs: Iterable[ConvertedDocument],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for doc in converted_docs:
        if doc.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = doc.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(doc.render_as_dict()))

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(doc.render_as_markdown())
        else:
            _log.info(f"Document {doc.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        # Path("/Users/cau/Downloads/Issue-36122.pdf"),
        # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"),
        Path("./test/data/2206.01062.pdf"),
        Path("./test/data/2203.01017v2.pdf"),
        Path("./test/data/2305.03393v1.pdf"),
    ]

    artifacts_path = DocumentConverter.download_models_hf()

    doc_converter = DocumentConverter(artifacts_path=artifacts_path)

    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    converted_docs = doc_converter.convert(input)
    export_documents(converted_docs, output_dir=Path("./scratch"))

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
11
examples/minimal.py
Normal file
@ -0,0 +1,11 @@
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)

input = DocumentConversionInput.from_paths(["factsheet.pdf"])
converted_docs = doc_converter.convert(input)

for d in converted_docs:
    print(d.render_as_dict())
4865
poetry.lock
generated
Normal file
File diff suppressed because it is too large
72
pyproject.toml
Normal file
@ -0,0 +1,72 @@
[tool.poetry]
name = "docling"
version = "0.1.0"
description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
readme = "README.md"
keywords = ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]

[tool.poetry.dependencies]
python = "^3.11"
pydantic = "^2.0.0"
docling-core = "^0.2.0"
docling-ibm-models = "^0.2.0"
deepsearch-glm = ">=0.18.4,<1"
deepsearch-toolkit = ">=0.47.0,<1"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"

[tool.poetry.group.ocr.dependencies]
easyocr = "^1.7"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 88
target-version = ["py311"]
include = '\.pyi?$'

[tool.isort]
profile = "black"
line_length = 88
py_version = 311

[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
python_version = "3.11"

[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]
BIN
test/data/2203.01017v2.pdf
Normal file
Binary file not shown.
BIN
test/data/2206.01062.pdf
Normal file
Binary file not shown.
BIN
test/data/2305.03393v1.pdf
Normal file
Binary file not shown.
33
test/test_backend_pdfium.py
Normal file
@ -0,0 +1,33 @@
from pathlib import Path

import pytest

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.datamodel.base_models import BoundingBox


@pytest.fixture
def test_doc_path():
    return Path("./data/2206.01062.pdf")


def test_get_text_from_rect(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102, t=77, r=511, b=124))
    ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"

    assert textpiece.strip() == ref


def test_crop_page_image(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527))
    # im.show()


def test_num_pages(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    assert doc_backend.page_count() == 9