Initial commit

Christoph Auer 2024-07-15 09:42:42 +02:00
commit e2d996753b
38 changed files with 8767 additions and 0 deletions

442
.gitignore vendored Normal file

@@ -0,0 +1,442 @@
model_artifacts/
scratch/
ds_convert_models/
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.ipynb_checkpoints
*/.ipynb_checkpoints/*
# IPython
profile_default/
ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### macOS Patch ###
# iCloud generated files
*.icloud
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
# IPython
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### Visual Studio Code ###
.vscode/
### VirtualEnv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# Docs
# docs/**/*.png
# docs/**/*.svg

34
.pre-commit-config.yaml Normal file

@@ -0,0 +1,34 @@
fail_fast: true
repos:
- repo: local
hooks:
- id: system
name: Black
entry: poetry run black docling examples
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: system
name: isort
entry: poetry run isort docling examples
pass_filenames: false
language: system
files: '\.py$'
# - repo: local
# hooks:
# - id: system
# name: flake8
# entry: poetry run flake8 docling
# pass_filenames: false
# language: system
# files: '\.py$'
# - repo: local
# hooks:
# - id: system
# name: MyPy
# entry: poetry run mypy docling
# pass_filenames: false
# language: system
# files: '\.py$'

129
CODE_OF_CONDUCT.md Normal file

@@ -0,0 +1,129 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement using
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).

184
CONTRIBUTING.md Normal file

@@ -0,0 +1,184 @@
## Contributing In General
Our project welcomes external contributions. If you have an itch, please feel
free to scratch it.
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
A good way to familiarize yourself with the codebase and contribution process is
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
**Note: We appreciate your effort, and want to avoid a situation where a contribution
requires extensive rework (by you or by us), sits in backlog for a long time, or
cannot be accepted at all!**
### Proposing new features
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
before sending a pull request so the feature can be discussed. This is to avoid
you wasting your valuable time working on a feature that the project developers
are not interested in accepting into the code base.
### Fixing bugs
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
pull request so it can be tracked.
### Merge approval
The project maintainers use LGTM (Looks Good To Me) in comments on the code
review to indicate acceptance. A change requires LGTMs from two of the
maintainers of each component affected.
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
## Legal
Each source file must include a license header for the MIT
license. Using the SPDX format is the simplest approach, e.g.:
```
/*
Copyright IBM Inc. All rights reserved.
SPDX-License-Identifier: MIT
*/
```
We have tried to make it as easy as possible to make contributions. This
applies to how we handle the legal aspects of contribution. We use the
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
uses to manage code contributions.
We simply ask that when submitting a patch for review, the developer
must include a sign-off statement in the commit message.
Here is an example Signed-off-by line, which indicates that the
submitter accepts the DCO:
```
Signed-off-by: John Doe <john.doe@example.com>
```
You can include this automatically when you commit a change to your
local git repository using the following command:
```
git commit -s
```
## Communication
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
## Developing
### Usage of Poetry
We use Poetry to manage dependencies.
#### Install
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
1. Install Poetry globally on your machine
```bash
curl -sSL https://install.python-poetry.org | python3 -
```
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
2. Make sure Poetry is in your `$PATH`
- for `zsh`
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
```
- for `bash`
```sh
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
```
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
#### Create a Virtual Environment and Install Dependencies
To activate the Virtual Environment, run:
```bash
poetry shell
```
This spawns a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
```bash
poetry install
```
**(Advanced) Use a Specific Python Version**
If for whatever reason you need to work with a specific (older) version of Python, run:
```bash
poetry env use $(which python3.8)
```
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
#### Add a new dependency
```bash
poetry add NAME
```
## Coding style guidelines
We use the following tools to enforce code style:
- isort, to sort imports
- Black, to format code
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
```bash
pre-commit install
```
To run the checks on-demand, run:
```
pre-commit run --all-files
```
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
## Documentation
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
To run the documentation server, do:
```bash
mkdocs serve
```
The server will be available at [http://localhost:8000](http://localhost:8000).
### Pushing Documentation to GitHub pages
Run the following:
```bash
mkdocs gh-deploy
```

23
Dockerfile Normal file

@@ -0,0 +1,23 @@
FROM python:3.11-slim-bookworm
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get clean
RUN --mount=type=ssh \
pip install --no-cache-dir git+https://github.com/DS4SD/docling.git
ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
# On container shell:
# > cd /root/
# > python minimal.py

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

10
MAINTAINERS.md Normal file

@@ -0,0 +1,10 @@
# MAINTAINERS
- Christoph Auer - [@cau-git](https://github.com/cau-git)
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
- Ahmed Nassar - [@nassarofficial](https://github.com/nassarofficial)
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).

99
README.md Normal file

@@ -0,0 +1,99 @@
<p align="center">
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
</p>
# Docling
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Optionally applies OCR (use with scanned PDFs)
## Setup
You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).
Once you have `poetry` installed, create an environment and install the package:
```bash
poetry env use $(which python3.11)
poetry shell
poetry install
```
**Notes**:
* Works on macOS and Linux environments. Windows platforms are currently not tested.
## Usage
For basic usage, see the [convert.py](examples/convert.py) example module. Run with:
```
python examples/convert.py
```
The output of the above command will be written to `./scratch`.
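For reference, here is a minimal sketch of the underlying Python API, assembled from the classes introduced in this commit (`DocumentConverter`, `DocumentConversionInput`, `ConvertedDocument`); the actual `examples/convert.py` may differ in its details:
```python
from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

# Without an explicit artifacts_path, model artifacts are fetched from
# Hugging Face on first use (see DocumentConverter.download_models_hf).
doc_converter = DocumentConverter()

# Any local PDF works; this is the test document also used further below.
input = DocumentConversionInput.from_paths([Path("./test/data/2206.01062.pdf")])

for converted in doc_converter.convert(input):
    print(converted.render_as_markdown())
```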
### Enable or disable pipeline features
You can control whether table structure recognition or OCR should be performed through arguments passed to `DocumentConverter`:
```python
doc_converter = DocumentConverter(
artifacts_path=artifacts_path,
pipeline_options=PipelineOptions(do_table_structure=False, # Controls if table structure is recovered.
do_ocr=True), # Controls if OCR is applied (ignores programmatic content)
)
```
### Impose limits on the document size
You can limit the file size and the number of pages that are allowed to be processed per document:
```python
paths = [Path("./test/data/2206.01062.pdf")]
input = DocumentConversionInput.from_paths(
paths, limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
)
```
### Convert from binary PDF streams
You can convert PDFs from a binary stream instead of from the filesystem as follows:
```python
buf = BytesIO(your_binary_stream)
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
input = DocumentConversionInput.from_streams(docs)
converted_docs = doc_converter.convert(input)
```
### Limit resource usage
You can limit the CPU threads used by `docling` by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
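As a sketch of one way to do this from Python (assuming the variable is set before `docling` and its ML dependencies are imported), rather than exporting it in the shell:
```python
import os

# Must be set before importing docling and the underlying ML libraries;
# otherwise the default of 4 CPU threads applies.
os.environ["OMP_NUM_THREADS"] = "2"

from docling.document_converter import DocumentConverter  # noqa: E402
```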
## Contributing
Please read [Contributing to Docling](./CONTRIBUTING.md) for details.
## References
If you use `Docling` in your projects, please consider citing the following:
```bib
@software{Docling,
author = {Deep Search Team},
month = {7},
title = {{Docling}},
url = {https://github.com/DS4SD/docling},
version = {main},
year = {2024}
}
```
## License
The `Docling` codebase is under MIT license.
For individual model usage, please refer to the model licenses found in the original packages.

0
docling/__init__.py Normal file



@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from PIL import Image
class PdfPageBackend(ABC):
def __init__(self, page_obj: Any) -> object:
pass
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "PageSize":
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(ABC):
@abstractmethod
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
pass
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass


@@ -0,0 +1,223 @@
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage):
super().__init__(page_obj)
self._ppage = page_obj
self.text_page = None
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
cells = []
cell_counter = 0
page_size = self.get_size()
for i in range(self.text_page.count_rects()):
rect = self.text_page.get_rect(i)
text_piece = self.text_page.get_text_bounded(*rect)
x0, y0, x1, y1 = rect
cells.append(
Cell(
id=cell_counter,
text=text_piece,
bbox=BoundingBox(
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
# The cell merging code below is to clean this up.
def merge_horizontal_cells(
cells: List[Cell],
horizontal_threshold_factor: float = 1.0,
vertical_threshold_factor: float = 0.5,
) -> List[Cell]:
if not cells:
return []
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
rows = []
current_row = [cells[0]]
row_top = cells[0].bbox.t
row_bottom = cells[0].bbox.b
row_height = cells[0].bbox.height
for cell in cells[1:]:
vertical_threshold = row_height * vertical_threshold_factor
if (
abs(cell.bbox.t - row_top) <= vertical_threshold
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
):
current_row.append(cell)
row_top = min(row_top, cell.bbox.t)
row_bottom = max(row_bottom, cell.bbox.b)
row_height = row_bottom - row_top
else:
rows.append(current_row)
current_row = [cell]
row_top = cell.bbox.t
row_bottom = cell.bbox.b
row_height = cell.bbox.height
if current_row:
rows.append(current_row)
return rows
def merge_row(row: List[Cell]) -> List[Cell]:
merged = []
current_group = [row[0]]
for cell in row[1:]:
prev_cell = current_group[-1]
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
if (
cell.bbox.l - prev_cell.bbox.r
<= avg_height * horizontal_threshold_factor
):
current_group.append(cell)
else:
merged.append(merge_group(current_group))
current_group = [cell]
if current_group:
merged.append(merge_group(current_group))
return merged
def merge_group(group: List[Cell]) -> Cell:
if len(group) == 1:
return group[0]
merged_text = "".join(cell.text for cell in group)
merged_bbox = BoundingBox(
l=min(cell.bbox.l for cell in group),
t=min(cell.bbox.t for cell in group),
r=max(cell.bbox.r for cell in group),
b=max(cell.bbox.b for cell in group),
)
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
rows = group_rows(cells)
merged_cells = [cell for row in rows for cell in merge_row(row)]
for i, cell in enumerate(merged_cells, 1):
cell.id = i
return merged_cells
def draw_clusters_and_cells():
image = self.get_page_image()
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:
cropbox = BoundingBox(
l=0,
r=page_size.width,
t=0,
b=page_size.height,
coord_origin=CoordOrigin.TOPLEFT,
)
padbox = BoundingBox(
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
)
else:
padbox = cropbox.to_bottom_left_origin(page_size.height)
padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
) # We resize the image from 1.5x the given scale to make it sharper.
return image
def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
self.text_page = None
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
super().__init__(path_or_stream)
if isinstance(path_or_stream, Path):
self._pdoc = pdfium.PdfDocument(path_or_stream)
elif isinstance(path_or_stream, BytesIO):
self._pdoc = pdfium.PdfDocument(
path_or_stream
) # TODO Fix me, won't accept bytes.
def page_count(self) -> int:
return len(self._pdoc)
def load_page(self, page_no: int) -> PdfPage:
return PyPdfiumPageBackend(self._pdoc[page_no])
def is_valid(self) -> bool:
return self.page_count() > 0
def unload(self):
self._pdoc.close()
self._pdoc = None



@@ -0,0 +1,247 @@
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator
from docling.backend.abstract_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
STARTED = auto()
FAILURE = auto()
SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto()
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class CoordOrigin(str, Enum):
TOPLEFT = auto()
BOTTOMLEFT = auto()
class PageSize(BaseModel):
width: float = 0.0
height: float = 0.0
class BoundingBox(BaseModel):
l: float # left
t: float # top
r: float # right
b: float # bottom
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
@property
def width(self):
return self.r - self.l
@property
def height(self):
return abs(self.t - self.b)
def as_tuple(self):
if self.coord_origin == CoordOrigin.TOPLEFT:
return (self.l, self.t, self.r, self.b)
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return (self.l, self.b, self.r, self.t)
@classmethod
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
)
elif origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
)
def area(self) -> float:
return (self.r - self.l) * (self.b - self.t)
def intersection_area_with(self, other: "BoundingBox") -> float:
# Calculate intersection coordinates
left = max(self.l, other.l)
top = max(self.t, other.t)
right = min(self.r, other.r)
bottom = min(self.b, other.b)
# Calculate intersection dimensions
width = right - left
height = bottom - top
# If the bounding boxes do not overlap, width or height will be negative
if width <= 0 or height <= 0:
return 0.0
return width * height
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
return self
elif self.coord_origin == CoordOrigin.TOPLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t,
b=page_height - self.b,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
def to_top_left_origin(self, page_height):
if self.coord_origin == CoordOrigin.TOPLEFT:
return self
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox(
l=self.l,
r=self.r,
t=page_height - self.t, # self.b
b=page_height - self.b, # self.t
coord_origin=CoordOrigin.TOPLEFT,
)
class Cell(BaseModel):
id: int
text: str
bbox: BoundingBox
class OcrCell(Cell):
confidence: float
class Cluster(BaseModel):
id: int
label: str
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []
class BasePageElement(BaseModel):
label: str
id: int
page_no: int
cluster: Cluster
text: Optional[str] = None
class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class TableCell(BaseModel):
bbox: BoundingBox
row_span: int
col_span: int
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False
@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
if isinstance(data, Dict):
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "
text = text.strip()
data["text"] = text
return data
class TableElement(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
num_cols: int = 0
table_cells: List[TableCell]
class TableStructurePrediction(BaseModel):
table_map: Dict[int, TableElement] = {}
class TextElement(BasePageElement):
...
class FigureData(BaseModel):
pass
class FigureElement(BasePageElement):
data: Optional[FigureData] = None
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
class FigureClassificationPrediction(BaseModel):
figure_count: int = 0
figure_map: Dict[int, FigureElement] = {}
class EquationPrediction(BaseModel):
equation_count: int = 0
equation_map: Dict[int, TextElement] = {}
class PagePredictions(BaseModel):
layout: LayoutPrediction = None
tablestructure: TableStructurePrediction = None
figures_classification: FigureClassificationPrediction = None
equations_prediction: EquationPrediction = None
PageElement = Union[TextElement, TableElement, FigureElement]
class AssembledUnit(BaseModel):
elements: List[PageElement]
body: List[PageElement]
headers: List[PageElement]
class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
page_hash: str = None
size: PageSize = None
image: Image = None
cells: List[Cell] = None
predictions: PagePredictions = PagePredictions()
assembled: AssembledUnit = None
_backend: PdfPageBackend = None # Internal PDF backend
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str
stream: BytesIO
class PipelineOptions(BaseModel):
do_table_structure: bool = True
do_ocr: bool = False


@@ -0,0 +1,351 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
from deepsearch.documents.core.export import export_to_markdown
from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from pydantic import BaseModel
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DocumentStream,
FigureElement,
Page,
TableElement,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash
_log = logging.getLogger(__name__)
layout_label_to_ds_type = {
"Title": "title",
"Document Index": "table-of-path_or_stream",
"Section-header": "subtitle-level-1",
"Checkbox-Selected": "checkbox-selected",
"Checkbox-Unselected": "checkbox-unselected",
"Caption": "caption",
"Page-header": "page-header",
"Page-footer": "page-footer",
"Footnote": "footnote",
"Table": "table",
"Formula": "equation",
"List-item": "paragraph",
"Code": "paragraph",
"Picture": "figure",
"Text": "paragraph",
}
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
valid: bool = False
limits: DocumentLimits = DocumentLimits()
filesize: Optional[int] = None
page_count: Optional[int] = None
_backend: PdfDocumentBackend = None # Internal PDF backend used
def __init__(
self,
path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=PyPdfiumDocumentBackend,
):
super().__init__()
self.limits = limits or DocumentLimits()
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
elif isinstance(path_or_stream, BytesIO):
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = pdf_backend(path_or_stream=path_or_stream)
if self.document_hash and self._backend.page_count() > 0:
self.page_count = self._backend.page_count()
if self.page_count <= self.limits.max_num_pages:
self.valid = True
except (FileNotFoundError, OSError) as e:
_log.exception(
f"File {self.file.name} not found or cannot be opened.", exc_info=e
)
# raise
except RuntimeError as e:
_log.exception(
f"An unexpected error occurred while opening the document {self.file.name}",
exc_info=e,
)
# raise
class ConvertedDocument(BaseModel):
input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors
pages: List[Page] = []
assembled: AssembledUnit = None
output: DsDocument = None
def to_ds_document(self) -> DsDocument:
title = ""
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no, model="default")
for p in self.pages
]
file_info = DsFileInfoObject(
filename=self.input.file.name,
document_hash=self.input.document_hash,
num_pages=self.input.page_count,
page_hashes=page_hashes,
)
main_text = []
tables = []
figures = []
page_no_to_page = {p.page_no: p for p in self.pages}
for element in self.assembled.elements:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)
if isinstance(element, TextElement):
main_text.append(
BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, len(element.text)],
)
],
)
)
elif isinstance(element, TableElement):
index = len(tables)
ref_str = f"#/tables/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
# bbox=[0,0,0,0],
spans=[[i, j]],
obj_type="body",
)
for j in range(element.num_cols)
]
for i in range(element.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in element.table_cells:
for i in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, element.num_cols),
min(cell.end_col_offset_idx, element.num_cols),
):
celltype = "body"
if cell.column_header:
celltype = "col_header"
elif cell.row_header:
celltype = "row_header"
def make_spans(cell):
for rspan in range(
min(cell.start_row_offset_idx, element.num_rows),
min(cell.end_row_offset_idx, element.num_rows),
):
for cspan in range(
min(
cell.start_col_offset_idx, element.num_cols
),
min(cell.end_col_offset_idx, element.num_cols),
):
yield [rspan, cspan]
spans = list(make_spans(cell))
table_data[i][j] = TableCell(
text=cell.text,
bbox=cell.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple(),
# col=j,
# row=i,
spans=spans,
obj_type=celltype,
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
)
tables.append(
DsSchemaTable(
num_cols=element.num_cols,
num_rows=element.num_rows,
obj_type=layout_label_to_ds_type.get(element.label),
data=table_data,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, 0],
)
],
)
)
elif isinstance(element, FigureElement):
index = len(figures)
ref_str = f"#/figures/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
figures.append(
BaseCell(
prov=[
Prov(
bbox=target_bbox,
page=element.page_no,
span=[0, 0],
)
],
obj_type=layout_label_to_ds_type.get(element.label),
# data=[[]],
)
)
page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
for p in self.pages
]
ds_doc = DsDocument(
name=title,
description=desc,
file_info=file_info,
main_text=main_text,
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
)
return ds_doc
def render_as_dict(self):
if self.output:
return self.output.model_dump(by_alias=True, exclude_none=True)
else:
return {}
def render_as_markdown(self):
if self.output:
return export_to_markdown(
self.output.model_dump(by_alias=True, exclude_none=True)
)
else:
return ""
class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
) -> Iterable[InputDocument]:
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
pdf_backend=pdf_backend,
)
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = paths
return doc_input
@classmethod
def from_streams(
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
):
doc_input = cls(limits=limits)
doc_input._path_or_stream_iterator = streams
return doc_input


@@ -0,0 +1,32 @@
import sys
from pydantic import BaseModel
from pydantic_settings import BaseSettings
class DocumentLimits(BaseModel):
max_num_pages: int = sys.maxsize
max_file_size: int = sys.maxsize
class BatchConcurrencySettings(BaseModel):
doc_batch_size: int = 2
doc_batch_concurrency: int = 2
page_batch_size: int = 4
page_batch_concurrency: int = 2
# doc_batch_size: int = 1
# doc_batch_concurrency: int = 1
# page_batch_size: int = 1
# page_batch_concurrency: int = 1
# model_concurrency: int = 2
# To force models into single core: export OMP_NUM_THREADS=1
class AppSettings(BaseSettings):
perf: BatchConcurrencySettings
settings = AppSettings(perf=BatchConcurrencySettings())


@@ -0,0 +1,207 @@
import functools
import logging
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from PIL import ImageDraw
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
Page,
PipelineOptions,
)
from docling.datamodel.document import (
ConvertedDocument,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash
_log = logging.getLogger(__name__)
class DocumentConverter:
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(
self,
artifacts_path: Optional[Union[Path, str]] = None,
pipeline_options: PipelineOptions = PipelineOptions(),
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
):
if not artifacts_path:
artifacts_path = self.download_models_hf()
artifacts_path = Path(artifacts_path)
self.model_pipeline = pipeline_cls(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
)
self.page_assemble_model = PageAssembleModel(config={})
self.glm_model = GlmModel(config={})
self.pdf_backend = pdf_backend
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
)
return Path(download_path)
def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
for input_batch in chunkify(
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
# with ThreadPoolExecutor(
# max_workers=settings.perf.doc_batch_concurrency
# ) as pool:
# yield from pool.map(self.process_document, input_batch)
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch)
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc)
if not in_doc.valid:
converted_doc.status = ConversionStatus.FAILURE
return converted_doc
for i in range(0, in_doc.page_count):
converted_doc.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size
):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self.populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self.parse_page_cells, in_doc),
pages_with_images,
)
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 7. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
assembled_page.image = (
None # Comment this if you want to visualize page images
)
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc)
converted_doc.status = ConversionStatus.SUCCESS
except Exception as e:
converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}")
end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
)
return converted_doc
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
return page
# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
page.image = page._backend.get_page_image()
return page
# Extract and populate the page cells and store it in the page object
def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.image, cells)
return page
def assemble_doc(self, converted_doc: ConvertedDocument):
all_elements = []
all_headers = []
all_body = []
for p in converted_doc.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
converted_doc.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
converted_doc.output = self.glm_model(converted_doc)



@@ -0,0 +1,82 @@
import copy
import random
from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
from docling.datamodel.document import ConvertedDocument
class GlmModel:
def __init__(self, config):
self.config = config
load_pretrained_nlp_models()
model = init_nlp_model(model_names="language;term;reference")
self.model = model
def __call__(self, document: ConvertedDocument) -> DsDocument:
ds_doc = document.to_ds_document()
ds_doc_dict = ds_doc.model_dump(by_alias=True)
glm_doc = self.model.apply_on_doc(ds_doc_dict)
ds_doc_dict = to_legacy_document_format(
glm_doc, ds_doc_dict, update_name_label=True
)
exported_doc = DsDocument.model_validate(ds_doc_dict)
# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
clusters_to_draw = []
image = copy.deepcopy(document.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
prov = ds_document.figures[index].prov[0]
else:
prov = None
if prov and prov.page == page_no:
clusters_to_draw.append(
Cluster(
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(document.pages[page_no].size.height),
)
)
draw = ImageDraw.Draw(image)
for c in clusters_to_draw:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return exported_doc


@@ -0,0 +1,77 @@
import copy
import logging
import random
from typing import Iterable
import numpy
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
_log = logging.getLogger(__name__)
class EasyOcrModel:
def __init__(self, config):
self.config = config
self.enabled = config["enabled"]
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled:
import easyocr
self.reader = easyocr.Reader(config["lang"])
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
# rects = page._fpage.
high_res_image = page._backend.get_page_image(scale=self.scale)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
line[0][0][0] / self.scale,
line[0][0][1] / self.scale,
line[0][2][0] / self.scale,
line[0][2][1] / self.scale,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
page.cells = cells # For now, just overwrites all digital cells.
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
yield page


@@ -0,0 +1,318 @@
import copy
import logging
import random
import time
from typing import Iterable, List
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw
from docling.datamodel.base_models import (
BoundingBox,
Cell,
Cluster,
CoordOrigin,
LayoutPrediction,
Page,
)
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
class LayoutModel:
TEXT_ELEM_LABELS = [
"Text",
"Footnote",
"Caption",
"Checkbox-Unselected",
"Checkbox-Selected",
"Section-header",
"Page-header",
"Page-footer",
"Code",
"List-item",
# "Formula",
]
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
TABLE_LABEL = "Table"
FIGURE_LABEL = "Picture"
FORMULA_LABEL = "Formula"
def __init__(self, config):
self.config = config
self.layout_predictor = LayoutPredictor(
config["artifacts_path"]
) # TODO temporary
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
"Caption": 0.35,
"Footnote": 0.35,
"Formula": 0.35,
"List-item": 0.35,
"Page-footer": 0.35,
"Page-header": 0.35,
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
"Section-header": 0.45,
"Table": 0.35,
"Text": 0.45,
"Title": 0.45,
"Document Index": 0.45,
"Code": 0.45,
"Checkbox-Selected": 0.45,
"Checkbox-Unselected": 0.45,
"Form": 0.45,
"Key-Value Region": 0.45,
}
_log.debug("================= Start postprocess function ====================")
start_time = time.time()
# Apply Confidence Threshold to cluster predictions
# confidence = self.conf_threshold
clusters_out = []
for cluster in clusters:
confidence = CLASS_THRESHOLDS[cluster.label]
if cluster.confidence >= confidence:
# annotation["created_by"] = "high_conf_pred"
clusters_out.append(cluster)
# map to dictionary clusters and cells, with bottom left origin
clusters = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"cell_ids": [],
"type": c.label,
}
for c in clusters
]
clusters_out = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"confidence": c.confidence,
"created_by": "high_conf_pred",
"cell_ids": [],
"type": c.label,
}
for c in clusters_out
]
raw_cells = [
{
"id": c.id,
"bbox": list(
c.bbox.to_bottom_left_origin(page_height).as_tuple()
), # TODO
"text": c.text,
}
for c in cells
]
cell_count = len(raw_cells)
_log.debug("---- 0. Treat cluster overlaps ------")
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
_log.debug(
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
)
## Check for cells included in or touched by clusters:
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
# Creates a map of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Assign orphan cells with lower confidence predictions
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
clusters_out, clusters, raw_cells, orphan_cell_indices
)
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
clusters_out = lu.assigning_cell_ids_to_clusters(
clusters_out, raw_cells, MIN_INTERSECTION
)
_log.debug("---- 3. Settle Ambigous Cells")
# Creates an update map after assignment of cell_id->cluster_id
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
# Settle pdf cells that belong to multiple clusters
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
clusters_out, raw_cells, ambiguous_cell_indices
)
_log.debug("---- 4. Set Orphans as Text")
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
clusters_out, clusters, raw_cells, orphan_cell_indices
)
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
# Merge cells orphan cells
clusters_out = lu.merge_cells(clusters_out)
# Clean up remaining merged or otherwise unreasonable clusters
clusters_out = lu.clean_up_clusters(
clusters_out,
raw_cells,
merge_cells=True,
img_table=True,
one_cell_table=True,
)
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
clusters_out = new_clusters
## We first rebuild where every cell is now:
## Now we write into a prediction cells list, not into the raw cells list.
## As we don't need the previous labels, we simply overwrite any old list, because it might
## have been sorted differently.
(
clusters_around_cells,
orphan_cell_indices,
ambiguous_cell_indices,
) = lu.cell_id_state_map(clusters_out, cell_count)
target_cells = []
for ix, cell in enumerate(raw_cells):
new_cell = {
"id": ix,
"rawcell_id": ix,
"label": "None",
"bbox": cell["bbox"],
"text": cell["text"],
}
for cluster_index in clusters_around_cells[
ix
]: # By previous analysis, this is always 1 cluster.
new_cell["label"] = clusters_out[cluster_index]["type"]
target_cells.append(new_cell)
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
cells_out = target_cells
## -------------------------------
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
_log.debug("---- 5. Sort clusters in reading order ------")
sorted_clusters = lu.produce_reading_order(
clusters_out, "raw_cell_ids", "raw_cell_ids", True
)
clusters_out = sorted_clusters
# end_time = timer()
_log.debug("---- End of postprocessing function ------")
end_time = time.time() - start_time
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
cells_out = [
Cell(
id=c["id"],
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
text=c["text"],
)
for c in cells_out
]
clusters_out_new = []
for c in clusters_out:
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
c_new = Cluster(
id=c["id"],
bbox=BoundingBox.from_tuple(
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
confidence=c["confidence"],
label=c["type"],
cells=cluster_cells,
)
clusters_out_new.append(c_new)
return clusters_out_new, cells_out
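# Illustration of the class-threshold filter at the top of postprocess (hypothetical values):
# a "Text" prediction with confidence 0.50 passes its 0.45 threshold and is kept,
# a "Text" prediction at 0.30 is dropped, and a "Picture" at 0.25 is kept, since
# pictures use the deliberately low 0.2 threshold.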
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
clusters = []
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
cluster = Cluster(
id=ix,
label=pred_item["label"],
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
)
# draw_clusters_and_cells()
page.predictions.layout = LayoutPrediction(clusters=clusters)
yield page
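A small self-contained sketch (illustrative boxes, top-left origin) of the cell-to-cluster mapping rule in __call__ above: a cell is attached to a cluster when more than half of the cell's area lies inside the cluster's bounding box.

def intersection_area(a, b):
    # boxes are (x0, y0, x1, y1)
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(w, 0) * max(h, 0)

cell = (10, 10, 20, 20)    # area 100
cluster = (0, 0, 18, 30)   # covers an 8 x 10 patch of the cell
overlap_frac = intersection_area(cell, cluster) / ((cell[2] - cell[0]) * (cell[3] - cell[1]))
print(overlap_frac)        # 0.8 -> greater than 0.5, so the cell joins the cluster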

View File

@ -0,0 +1,160 @@
import logging
import re
from typing import Iterable, List
from docling.datamodel.base_models import (
AssembledUnit,
FigureElement,
Page,
PageElement,
TableElement,
TextElement,
)
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
class PageAssembleModel:
def __init__(self, config):
self.config = config
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
# def sanitize_text_poor(self, lines):
# text = '\n'.join(lines)
#
# # treat line wraps.
# sanitized_text = self.line_wrap_pattern.sub('', text)
#
# sanitized_text = sanitized_text.replace('\n', ' ')
#
# return sanitized_text
def sanitize_text(self, lines):
if len(lines) <= 1:
return " ".join(lines)
for ix, line in enumerate(lines[1:]):
prev_line = lines[ix]
if prev_line.endswith("-"):
prev_words = re.findall(r"\b[\w]+\b", prev_line)
line_words = re.findall(r"\b[\w]+\b", line)
if (
len(prev_words)
and len(line_words)
and prev_words[-1].isalnum()
and line_words[0].isalnum()
):
lines[ix] = prev_line[:-1]
else:
lines[ix] += " "
sanitized_text = "".join(lines)
return sanitized_text.strip() # Strip any leading or trailing whitespace
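# Worked example (illustration): for lines = ["convolu-", "tional networks", "are widely used"],
# the loop rewrites lines[0] to "convolu" (dropping the soft hyphen between two words) and
# appends a space to lines[1], so the joined, stripped result is
# "convolutional networks are widely used".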
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
# assembles some JSON output page by page.
elements: List[PageElement] = []
headers: List[PageElement] = []
body: List[PageElement] = []
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text=text,
page_no=page.page_no,
cluster=cluster,
)
elements.append(text_el)
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
headers.append(text_el)
else:
body.append(text_el)
elif cluster.label == LayoutModel.TABLE_LABEL:
tbl = None
if page.predictions.tablestructure:
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = TableElement(
label=cluster.label,
id=cluster.id,
text="",
otsl_seq=[],
table_cells=[],
cluster=cluster,
page_no=page.page_no,
)
elements.append(tbl)
body.append(tbl)
elif cluster.label == LayoutModel.FIGURE_LABEL:
fig = None
if page.predictions.figures_classification:
fig = page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = (
page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
)
if not equation: # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
page.assembled = AssembledUnit(
elements=elements, headers=headers, body=body
)
yield page
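A toy sketch (plain tuples, made-up labels) of the header/body split performed above for text-like elements: every element is collected in elements, while page headers and footers additionally go to headers and everything else to body.

elements, headers, body = [], [], []
for label, text in [("Section-header", "1 Introduction"), ("Page-header", "arXiv preprint"), ("Text", "Lorem ipsum")]:
    el = (label, text)
    elements.append(el)
    if label in ("Page-header", "Page-footer"):
        headers.append(el)
    else:
        body.append(el)
print([lbl for lbl, _ in body])  # ['Section-header', 'Text']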

View File

@ -0,0 +1,114 @@
from typing import Iterable
import numpy
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
from docling.datamodel.base_models import (
BoundingBox,
Page,
TableCell,
TableElement,
TableStructurePrediction,
)
class TableStructureModel:
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]
self.enabled = config["enabled"]
if self.enabled:
artifacts_path = config["artifacts_path"]
# Third Party
import docling_ibm_models.tableformer.common as c
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
self.tm_config["model"]["save_dir"] = artifacts_path
self.tm_model_type = self.tm_config["model"]["type"]
self.tf_predictor = TFPredictor(self.tm_config)
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
page.predictions.tablestructure = TableStructurePrediction() # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l),
round(cluster.bbox.t),
round(cluster.bbox.r),
round(cluster.bbox.b),
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == "Table"
]
if not len(in_tables):
yield page
continue
tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
> 0.2
):
# Only allow non-empty strings (not just whitespace) into the cells of a table
if len(c.text.strip()) > 0:
tokens.append(c.model_dump())
iocr_page = {
"image": numpy.asarray(page.image),
"tokens": tokens,
"width": page.size.width,
"height": page.size.height,
}
table_clusters, table_bboxes = zip(*in_tables)
if len(table_bboxes):
tf_output = self.tf_predictor.multi_table_predict(
iocr_page, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, table_out in zip(table_clusters, tf_output):
table_cells = []
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(element["bbox"])
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element)
table_cells.append(tc)
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = TableElement(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label="Table",
)
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
yield page
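A rough sketch of the token filter applied above before calling the table predictor, using plain tuples instead of the Cell and BoundingBox objects (the inter_area helper and the values are illustrative only): a PDF cell becomes a token when more than 20% of its area falls inside a table cluster and its text is non-empty.

def inter_area(a, b):
    # boxes are (x0, y0, x1, y1)
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(w, 0) * max(h, 0)

table_bbox = (50, 100, 400, 300)
cells = [
    ((60, 110, 120, 125), "Head"),   # inside the table -> token
    ((10, 10, 40, 25), "Title"),     # outside -> skipped
    ((70, 130, 90, 140), "   "),     # inside but blank -> skipped
]
tokens = [
    text for bbox, text in cells
    if inter_area(bbox, table_bbox) / ((bbox[2] - bbox[0]) * (bbox[3] - bbox[1])) > 0.2
    and len(text.strip()) > 0
]
print(tokens)  # ['Head']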

View File

View File

@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch
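A toy sketch of the generator chaining in apply: each model wraps the iterator returned by the previous one, so pages stream through all stages lazily. The model names below are illustrative only.

def upper_model(pages):
    for p in pages:
        yield p.upper()

def exclaim_model(pages):
    for p in pages:
        yield p + "!"

model_pipe = [upper_model, exclaim_model]
page_batch = iter(["page one", "page two"])
for model in model_pipe:
    page_batch = model(page_batch)
print(list(page_batch))  # ['PAGE ONE!', 'PAGE TWO!']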

View File

@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
self.model_pipe = [
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": False,
}
),
]

View File

View File

@ -0,0 +1,806 @@
import copy
import logging
import networkx as nx
logger = logging.getLogger("layout_utils")
## -------------------------------
## Geometric helper functions
## The coordinates grow left to right, and bottom to top.
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
def area(bbox):
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
def contains(bbox_i, bbox_j):
## Returns True if bbox_i contains bbox_j, else False
return (
bbox_i[0] <= bbox_j[0]
and bbox_i[1] <= bbox_j[1]
and bbox_i[2] >= bbox_j[2]
and bbox_i[3] >= bbox_j[3]
)
def is_intersecting(bbox_i, bbox_j):
return not (
bbox_i[2] < bbox_j[0]
or bbox_i[0] > bbox_j[2]
or bbox_i[3] < bbox_j[1]
or bbox_i[1] > bbox_j[3]
)
def bb_iou(boxA, boxB):
# determine the (x, y)-coordinates of the intersection rectangle
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
# compute the area of intersection rectangle
interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
# compute the area of both the prediction and ground-truth
# rectangles
boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the intersection area
iou = interArea / float(boxAArea + boxBArea - interArea)
# return the intersection over union value
return iou
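# Worked example (illustration, inclusive pixel coordinates): boxA = [0, 0, 9, 9] and
# boxB = [5, 5, 14, 14] give interArea = 5 * 5 = 25 and boxAArea = boxBArea = 100,
# so bb_iou returns 25 / (100 + 100 - 25), roughly 0.143.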
def compute_intersection(bbox_i, bbox_j):
## Returns the size of the intersection area of the two boxes
if not is_intersecting(bbox_i, bbox_j):
return 0
## Determine the (x, y)-coordinates of the intersection rectangle:
xA = max(bbox_i[0], bbox_j[0])
yA = max(bbox_i[1], bbox_j[1])
xB = min(bbox_i[2], bbox_j[2])
yB = min(bbox_i[3], bbox_j[3])
## Compute the area of intersection rectangle:
interArea = (xB - xA) * (yB - yA)
if interArea < 0:
logger.debug("Warning: Negative intersection detected!")
return 0
return interArea
def surrounding(bbox_i, bbox_j):
## Computes minimal box that contains both input boxes
sbox = []
sbox.append(min(bbox_i[0], bbox_j[0]))
sbox.append(min(bbox_i[1], bbox_j[1]))
sbox.append(max(bbox_i[2], bbox_j[2]))
sbox.append(max(bbox_i[3], bbox_j[3]))
return sbox
def surrounding_list(bbox_list):
## Computes minimal box that contains all boxes in the input list
## The list should be non-empty, but just in case it's not:
if len(bbox_list) == 0:
sbox = [0, 0, 0, 0]
else:
sbox = []
sbox.append(min([bbox[0] for bbox in bbox_list]))
sbox.append(min([bbox[1] for bbox in bbox_list]))
sbox.append(max([bbox[2] for bbox in bbox_list]))
sbox.append(max([bbox[3] for bbox in bbox_list]))
return sbox
def vertical_overlap(bboxA, bboxB):
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
if bboxB[3] < bboxA[1]: ## B below A
return False
elif bboxA[3] < bboxB[1]: ## A below B
return False
else:
return True
def vertical_overlap_fraction(bboxA, bboxB):
## Returns the vertical overlap as a fraction of the smaller bbox height.
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
## Height 0 is permitted in the input.
heightA = bboxA[3] - bboxA[1]
heightB = bboxB[3] - bboxB[1]
min_height = min(heightA, heightB)
if bboxA[3] >= bboxB[3]: ## A starts higher or equal
if (
bboxA[1] <= bboxB[1]
): ## B is completely in A; this can include height of B = 0:
fraction = 1
else:
overlap = max(bboxB[3] - bboxA[1], 0)
fraction = overlap / max(min_height, 0.001)
else:
if (
bboxB[1] <= bboxA[1]
): ## A is completely in B; this can include height of A = 0:
fraction = 1
else:
overlap = max(bboxA[3] - bboxB[1], 0)
fraction = overlap / max(min_height, 0.001)
return fraction
## -------------------------------
## Cluster-and-cell relations
def compute_enclosed_cells(
cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
cells_in_cluster = []
cells_in_cluster_int = []
for ix, cell in enumerate(raw_cells):
cell_bbox = cell["bbox"]
intersection = compute_intersection(cell_bbox, cluster_bbox)
frac_area = area(cell_bbox) * min_cell_intersection_with_cluster
if (
intersection > frac_area and frac_area > 0
): # intersect > certain fraction of cell
cells_in_cluster.append(ix)
cells_in_cluster_int.append(intersection)
elif contains(
cluster_bbox,
[cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
):
cells_in_cluster.append(ix)
return cells_in_cluster, cells_in_cluster_int
def find_clusters_around_cells(cell_count, clusters):
## Per raw cell, find to which clusters it belongs.
## Return list of these indices in the raw-cell order.
clusters_around_cells = [[] for _ in range(cell_count)]
for cl_ix, cluster in enumerate(clusters):
for ix in cluster["cell_ids"]:
clusters_around_cells[ix].append(cl_ix)
return clusters_around_cells
def find_cell_index(raw_ix, cell_array):
## "raw_ix" is a rawcell_id.
## "cell_array" has the structure of an (annotation) cells array.
## Returns index of cell in cell_array that has this rawcell_id.
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
return ix
def find_cell_indices(cluster, cell_array):
## "cluster" must have the structure as in a clusters array in a prediction,
## "cell_array" that of a cells array.
## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
## in the order of the rawcell_ids.
result = []
for raw_ix in sorted(cluster["cell_ids"]):
## Find the cell with this rawcell_id (if any)
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
result.append(ix)
return result
def find_first_cell_index(cluster, cell_array):
## "cluster" must be a dict with key "cell_ids"; it can also be a line.
## "cell_array" has the structure of a cells array in an annotation.
## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
result = [] ## We keep it a list as it can be empty (picture without text cells)
if len(cluster["cell_ids"]) == 0:
return result
raw_ix = min(cluster["cell_ids"])
## Find the cell with this rawcell_id (if any)
for ix, cell in enumerate(cell_array):
if cell["rawcell_id"] == raw_ix:
result.append(ix)
break ## One is enough; should be only one anyway.
if result == []:
logger.debug(
" Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
)
return result
## -------------------------------
## Cluster labels and text
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
## "cluster" must have the structure as in a clusters array in a prediction,
## "cl_ix" is its index in target_pred,
## "new_label" is the intended new label,
## "target_pred" is the entire current target prediction.
## Sets label on the cluster itself, and on the cells in the target_pred.
## Returns new_label so that also the cl_label variable in the main code is easily set.
target_pred["clusters"][cl_ix]["type"] = new_label
cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
for ix in cluster_target_cells:
target_pred["cells"][ix]["label"] = new_label
return new_label
def find_cluster_text(cluster, raw_cells):
## "cluster" must be a dict with "cell_ids"; it can also be a line.
## "raw_cells" must have the format of item["raw"]["cells"]
## Returns the text of the cluster, with blanks between the cell contents
## (which seem to be words or phrases without starting or trailing blanks).
## Note that in formulas, this may introduce more blanks than in the original.
cluster_text = ""
for raw_ix in sorted(cluster["cell_ids"]):
cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
return cluster_text.rstrip()
def find_cluster_text_without_blanks(cluster, raw_cells):
## "cluster" must be a dict with "cell_ids"; it can also be a line.
## "raw_cells" must have the format of item["raw"]["cells"]
## Returns the text of the cluster, without blanks between the cell contents
## Interesting in formula analysis.
cluster_text = ""
for raw_ix in sorted(cluster["cell_ids"]):
cluster_text = cluster_text + raw_cells[raw_ix]["text"]
return cluster_text.rstrip()
## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)
def build_cluster_from_lines(lines, label, id):
## Lines must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
## (There is no condition that they are really geometrically lines)
## A cluster in standard format is returned with given label and id
local_lines = copy.deepcopy(
lines
) ## without this, it changes "lines" also outside this function
first_line = local_lines.pop(0)
cluster = {
"id": id,
"type": label,
"cell_ids": first_line["cell_ids"],
"bbox": first_line["bbox"],
"confidence": 0,
"created_by": "merged_cells",
}
confidence = 0
counter = 0
for line in local_lines:
new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
cluster["cell_ids"] = new_cell_ids
cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
counter += 1
confidence += line["confidence"]
confidence = confidence / counter
cluster["confidence"] = confidence
return cluster
## -------------------------------
## Reading order
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
## In:
## Clusters: list as in predictions.
## cluster_sort_type: string, currently only "raw_cell_ids".
## cell_sort_type: string, currently only "raw_cell_ids".
## sort_ids: Boolean, whether the cluster ids should be adapted to their new position
## Out: Another clusters list, sorted according to the type.
logger.debug("---- Start cluster sorting ------")
if cell_sort_type == "raw_cell_ids":
for cl in clusters:
sorted_cell_ids = sorted(cl["cell_ids"])
cl["cell_ids"] = sorted_cell_ids
else:
logger.debug(
"Unknown cell_sort_type `"
+ cell_sort_type
+ "`, no cell sorting will happen."
)
if cluster_sort_type == "raw_cell_ids":
clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
logger.debug(
"Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
)
logger.debug(
" Their first cell ids: "
+ str([cl["cell_ids"][0] for cl in clusters_with_cells])
)
logger.debug(
"Clusters without cells: "
+ str([cl["id"] for cl in clusters_without_cells])
)
clusters_with_cells_sorted = sorted(
clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
)
logger.debug(
" First cell ids after sorting: "
+ str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
)
sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
else:
logger.debug(
"Unknown cluster_sort_type: `"
+ cluster_sort_type
+ "`, no cluster sorting will happen."
)
if sort_ids:
for i, cl in enumerate(sorted_clusters):
cl["id"] = i
return sorted_clusters
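# Example (illustration): clusters with cell_ids [5, 2], [0, 7] and [] are first given sorted
# cell_ids ([2, 5] and [0, 7]), then ordered by their first cell id, so the cluster starting at
# cell 0 comes first, the one starting at cell 2 second, and the cell-less cluster last;
# with sort_ids=True their ids are renumbered 0, 1, 2 in that order.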
## -------------------------------
## Line Splitting
def sort_cells_horizontal(line_cell_ids, raw_cells):
## "line_cells" should be a non-empty list of (raw) cell_ids
## "raw_cells" has the structure of item["raw"]["cells"].
## Sorts the cells in the line by x0 (left start).
new_line_cell_ids = sorted(
line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
)
return new_line_cell_ids
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
new_clusters = []
for ix, cluster in enumerate(clusters):
new_cluster = copy.deepcopy(cluster)
logger.debug(
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
new_cluster["bbox"] = new_bbox
new_clusters.append(new_cluster)
return new_clusters
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in ["Table", "Picture"]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == "Picture":
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
else:
logger.debug(" without text cells, no change.")
new_bbox = cluster["bbox"]
else: ## A table
## At least we have to keep the included text cells, and we make the bbox completely comprise them
min_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" Minimum bbox: " + str(min_bbox))
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding(min_bbox, cluster["bbox"])
logger.debug(" Possibly increased bbox: " + str(new_bbox))
## Now we look which non-belonging cells are covered.
## (To decrease dependencies, we don't make use of which cells we actually removed.)
## We don't worry about orphan cells, those could still be added to the table.
enclosed_cells = compute_enclosed_cells(
new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
)[0]
additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
logger.debug(
" Additional cells enclosed by Table bbox: " + str(additional_cells)
)
spurious_cells = additional_cells - set(orphan_cell_indices)
logger.debug(
" Spurious cells enclosed by Table bbox (additional minus orphans): "
+ str(spurious_cells)
)
if len(spurious_cells) == 0:
return new_bbox
## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
## We initialize possible cuts with the current bbox.
left_cut = new_bbox[0]
right_cut = new_bbox[2]
upper_cut = new_bbox[3]
lower_cut = new_bbox[1]
for cell_ix in spurious_cells:
cell = raw_cells[cell_ix]
# logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
is_left = cell["bbox"][2] < min_bbox[0]
is_right = cell["bbox"][0] > min_bbox[2]
is_above = cell["bbox"][1] > min_bbox[3]
is_below = cell["bbox"][3] < min_bbox[1]
# logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))
if is_left:
if cell["bbox"][2] > left_cut:
## We move the left cut to exclude this cell:
left_cut = cell["bbox"][2]
if is_right:
if cell["bbox"][0] < right_cut:
## We move the right cut to exclude this cell:
right_cut = cell["bbox"][0]
if is_above:
if cell["bbox"][1] < upper_cut:
## We move the upper cut to exclude this cell:
upper_cut = cell["bbox"][1]
if is_below:
if cell["bbox"][3] > lower_cut:
## We move the lower cut to exclude this cell:
lower_cut = cell["bbox"][3]
# logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))
new_bbox = [left_cut, lower_cut, right_cut, upper_cut]
logger.debug(" Final bbox: " + str(new_bbox))
return new_bbox
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
DuplicateDeletedClusterIDs = []
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if cluster_1["id"] != cluster_2["id"]:
if_conf = False
if cluster_1["confidence"] > cluster_2["confidence"]:
if_conf = True
if if_conf == True:
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
DuplicateDeletedClusterIDs.append(cluster_2["id"])
elif contains(
cluster_1["bbox"],
[
cluster_2["bbox"][0] + 3,
cluster_2["bbox"][1] + 3,
cluster_2["bbox"][2] - 3,
cluster_2["bbox"][3] - 3,
],
):
DuplicateDeletedClusterIDs.append(cluster_2["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
for cl_id in DuplicateDeletedClusterIDs:
for cluster in cluster_predictions:
if cl_id == cluster["id"]:
cluster_predictions.remove(cluster)
return cluster_predictions
# Assign orphan cells using predictions from the unfiltered (low-confidence) cluster list
def assign_orphans_with_low_conf_pred(
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
for orph_id in orphan_cell_indices:
cluster_chosen = {}
iou_thresh = 0.05
confidence = 0.05
# Loop over all predictions, and find the one with the highest IOU, and confidence
for cluster in cluster_predictions_low:
calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
cluster["bbox"][2] - cluster["bbox"][0]
)
cell_area = (
raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])
if (
(iou_thresh < calc_iou)
and (cluster["confidence"] > confidence)
and (cell_area * 3 > cluster_area)
):
cluster_chosen = cluster
iou_thresh = calc_iou
confidence = cluster["confidence"]
# If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
if iou_thresh != 0.05 and confidence != 0.05:
cluster_chosen["cell_ids"].append(orph_id)
cluster_chosen["created_by"] = "orph_low_conf"
cluster_predictions.append(cluster_chosen)
orphan_cell_indices.remove(orph_id)
return cluster_predictions, orphan_cell_indices
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
for amb_cell_id in amb_cell_idxs:
highest_conf = 0
highest_bbox_iou = 0
cluster_chosen = None
problamatic_clusters = []
# Find clusters in question
for cluster in cluster_predictions:
if amb_cell_id in cluster["cell_ids"]:
problamatic_clusters.append(amb_cell_id)
# Prefer the cluster with the highest confidence and the highest IoU with this cell
bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
if (
cluster["confidence"] > highest_conf
and bbox_iou_val > highest_bbox_iou
):
cluster_chosen = cluster
highest_conf = cluster["confidence"]
highest_bbox_iou = bbox_iou_val
if cluster["id"] in problamatic_clusters:
problamatic_clusters.remove(cluster["id"])
# Now remove the cell id assignment from the remaining lower-confidence clusters
for cluster in cluster_predictions:
for prob_amb_id in problamatic_clusters:
if prob_amb_id in cluster["cell_ids"]:
cluster["cell_ids"].remove(prob_amb_id)
amb_cell_idxs.remove(amb_cell_id)
return cluster_predictions, amb_cell_idxs
def ranges(nums):
# Group sorted numbers into ranges of consecutive values.
# Used to detect (and later ignore) line numbers in review manuscripts.
nums = sorted(set(nums))
gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
return list(zip(edges, edges))
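# Example (illustration): ranges([2, 3, 4, 7, 8, 10]) builds gaps [[4, 7], [8, 10]] and
# returns [(2, 4), (7, 8), (10, 10)], i.e. the maximal runs of consecutive numbers.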
def set_orphan_as_text(
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == "Picture":
figures.append(cluster)
if cluster["id"] > max_id:
max_id = cluster["id"]
max_id += 1
lines_detector = False
content_of_orphans = []
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
content_of_orphans.append(raw_cells[orph_id]["text"])
fil_content_of_orphans = []
for cell_content in content_of_orphans:
if cell_content.isnumeric():
try:
num = int(cell_content)
fil_content_of_orphans.append(num)
except ValueError: # ignore the cell
pass
# line_orphans = []
# If there are more than 2 numeric orphan pdf cells, check whether they form
# consecutive series of numbers (using the ranges function) to decide whether
# they are line numbers.
if len(fil_content_of_orphans) > 2:
out_ranges = ranges(fil_content_of_orphans)
if len(out_ranges) > 1:
cnt_range = 0
for ranges_ in out_ranges:
if ranges_[0] != ranges_[1]:
# If a consecutive range covers more than 75 numbers (about half the typical line
# count of a review-manuscript page), decide that these are line numbers to be ignored.
if len(list(range(ranges_[0], ranges_[1]))) > 75:
lines_detector = True
# line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))
for orph_id in orphan_cell_indices:
orph_cell = raw_cells[orph_id]
if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
fig_flag = False
# Do not assign orphan cells if they are inside a figure
for fig in figures:
if contains(fig["bbox"], orph_cell["bbox"]):
fig_flag = True
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = "Text"
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = "Text"
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
):
class_type = cluster["type"]
elif contains(
cluster["bbox"],
[
orph_cell["bbox"][0] + 3,
orph_cell["bbox"][1] + 3,
orph_cell["bbox"][2] - 3,
orph_cell["bbox"][3] - 3,
],
):
class_type = cluster["type"]
elif intersection > area(orph_cell["bbox"]) * 0.2:
class_type = cluster["type"]
new_cluster = {
"id": max_id,
"bbox": orph_cell["bbox"],
"type": class_type,
"cell_ids": [orph_id],
"confidence": -1,
"created_by": "orphan_default",
}
max_id += 1
cluster_predictions.append(new_cluster)
return cluster_predictions, orphan_cell_indices
def merge_cells(cluster_predictions):
# Using graph connected components, merge orphan clusters whose bboxes touch or are very close.
G = nx.Graph()
for cluster in cluster_predictions:
if cluster["created_by"] == "orphan_default":
G.add_node(cluster["id"])
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if (
cluster_1["id"] != cluster_2["id"]
and cluster_2["created_by"] == "orphan_default"
and cluster_1["created_by"] == "orphan_default"
):
cl1 = copy.deepcopy(cluster_1["bbox"])
cl2 = copy.deepcopy(cluster_2["bbox"])
cl1[0] = cl1[0] - 2
cl1[1] = cl1[1] - 2
cl1[2] = cl1[2] + 2
cl1[3] = cl1[3] + 2
cl2[0] = cl2[0] - 2
cl2[1] = cl2[1] - 2
cl2[2] = cl2[2] + 2
cl2[3] = cl2[3] + 2
if is_intersecting(cl1, cl2):
G.add_edge(cluster_1["id"], cluster_2["id"])
component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
max_id = -1
for cluster_1 in cluster_predictions:
if cluster_1["id"] > max_id:
max_id = cluster_1["id"]
for nodes in component:
if len(nodes) > 1:
max_id += 1
lines = []
for node in nodes:
for cluster in cluster_predictions:
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions
def clean_up_clusters(
cluster_predictions,
raw_cells,
merge_cells=False,
img_table=False,
one_cell_table=False,
):
DuplicateDeletedClusterIDs = []
for cluster_1 in cluster_predictions:
for cluster_2 in cluster_predictions:
if cluster_1["id"] != cluster_2["id"]:
# remove any artifacts created by merging clusters
if merge_cells == True:
if contains(
cluster_1["bbox"],
[
cluster_2["bbox"][0] + 3,
cluster_2["bbox"][1] + 3,
cluster_2["bbox"][2] - 3,
cluster_2["bbox"][3] - 3,
],
):
cluster_1["cell_ids"] = (
cluster_1["cell_ids"] + cluster_2["cell_ids"]
)
DuplicateDeletedClusterIDs.append(cluster_2["id"])
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == "Text"
and cluster_2["type"] == "Picture"
or cluster_2["type"] == "Table"
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
elif contains(
[
cluster_2["bbox"][0] - 3,
cluster_2["bbox"][1] - 3,
cluster_2["bbox"][2] + 3,
cluster_2["bbox"][3] + 3,
],
cluster_1["bbox"],
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])
# remove tables that have one pdf cell
if one_cell_table == True:
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
for cl_id in DuplicateDeletedClusterIDs:
for cluster in cluster_predictions:
if cl_id == cluster["id"]:
cluster_predictions.remove(cluster)
return cluster_predictions
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
for cluster in clusters:
cells_in_cluster, _ = compute_enclosed_cells(
cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
)
cluster["cell_ids"] = cells_in_cluster
## These cell_ids are ids of the raw cells.
## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
return clusters
# Creates a map of cell_id->cluster_id
def cell_id_state_map(clusters, cell_count):
clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
orphan_cell_indices = [
ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
] # which cells are assigned no cluster?
ambiguous_cell_indices = [
ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
] # which cells are assigned > 1 clusters?
return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
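# Example (illustration): with cell_count = 4 and clusters whose "cell_ids" are [0, 1] and [1],
# clusters_around_cells is [[0], [0, 1], [], []], so orphan_cell_indices = [2, 3] (cells with no
# cluster) and ambiguous_cell_indices = [1] (a cell claimed by two clusters).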

41
docling/utils/utils.py Normal file
View File

@ -0,0 +1,41 @@
import hashlib
from io import BytesIO
from itertools import islice
from pathlib import Path
from typing import List, Union
def chunkify(iterator, chunk_size):
"""Yield successive chunks of chunk_size from the iterable."""
if isinstance(iterator, List):
iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1))
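# Example (illustration): list(chunkify([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]];
# the final chunk may be shorter than chunk_size.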
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
"""Create a stable page_hash of the path_or_stream of a file"""
block_size = 65536
hasher = hashlib.sha256()
def _hash_buf(binary_stream):
buf = binary_stream.read(block_size) # read and page_hash in chunks
while len(buf) > 0:
hasher.update(buf)
buf = binary_stream.read(block_size)
if isinstance(path_or_stream, Path):
with path_or_stream.open("rb") as afile:
_hash_buf(afile)
elif isinstance(path_or_stream, BytesIO):
_hash_buf(path_or_stream)
return hasher.hexdigest()
def create_hash(string: str):
hasher = hashlib.sha256()
hasher.update(string.encode("utf-8"))
return hasher.hexdigest()
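# Example (illustration): create_hash("abc") returns the SHA-256 hex digest
# "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad".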

73
examples/convert.py Normal file
View File

@ -0,0 +1,73 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
converted_docs: Iterable[ConvertedDocument],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = doc.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(doc.render_as_dict()))
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown())
else:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
# Path("/Users/cau/Downloads/Issue-36122.pdf"),
# Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"),
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
]
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
converted_docs = doc_converter.convert(input)
export_documents(converted_docs, output_dir=Path("./scratch"))
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if __name__ == "__main__":
main()

11
examples/minimal.py Normal file
View File

@ -0,0 +1,11 @@
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)
input = DocumentConversionInput.from_paths(["factsheet.pdf"])
converted_docs = doc_converter.convert(input)
for d in converted_docs:
print(d.render_as_dict())

BIN
logo.png Normal file

Binary file not shown.


4865
poetry.lock generated Normal file

File diff suppressed because it is too large

72
pyproject.toml Normal file
View File

@ -0,0 +1,72 @@
[tool.poetry]
name = "docling"
version = "0.1.0"
description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
readme = "README.md"
keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.11"
pydantic = "^2.0.0"
docling-core = "^0.2.0"
docling-ibm-models = "^0.2.0"
deepsearch-glm = ">=0.18.4,<1"
deepsearch-toolkit = ">=0.47.0,<1"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
[tool.poetry.group.ocr.dependencies]
easyocr = "^1.7"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 88
target-version = ["py311"]
include = '\.pyi?$'
[tool.isort]
profile = "black"
line_length = 88
py_version=311
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
python_version = "3.11"
[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]

BIN
test/data/2203.01017v2.pdf Normal file

Binary file not shown.

BIN
test/data/2206.01062.pdf Normal file

Binary file not shown.

BIN
test/data/2305.03393v1.pdf Normal file

Binary file not shown.

View File

@ -0,0 +1,33 @@
from pathlib import Path
import pytest
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.datamodel.base_models import BoundingBox
@pytest.fixture
def test_doc_path():
return Path("./data/2206.01062.pdf")
def test_get_text_from_rect(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Get the title text of the DocLayNet paper
textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102,t=77,r=511,b=124))
ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"
assert textpiece.strip() == ref
def test_crop_page_image(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317,t=246,r=574,b=527))
# im.show()
def test_num_pages(test_doc_path):
doc_backend = PyPdfiumDocumentBackend(test_doc_path)
assert doc_backend.page_count() == 9