Initial commit
commit e2d996753b
.gitignore (vendored) · 442 lines · new file
@@ -0,0 +1,442 @@
model_artifacts/
scratch/
ds_convert_models/

# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks

### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*

# Org-mode
.org-id-locations
*_archive

# flymake-mode
*_flymake.*

# eshell files
/eshell/history
/eshell/lastdir

# elpa packages
/elpa/

# reftex files
*.rel

# AUCTeX auto folder
/auto/

# cask packages
.cask/
dist/

# Flycheck
flycheck_*.el

# server auth directory
/server/

# projectiles files
.projectile

# directory configuration
.dir-locals.el

# network security
/network-security.data


### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
#   git rm -r .ipynb_checkpoints/

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/

# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml

# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/

# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$

# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml

# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook

# IPython

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg  # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]

# Session
Session.vim
Sessionx.vim

# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~


### Visual Studio Code ###
.vscode/

### VirtualEnv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide


# Docs
# docs/**/*.png
# docs/**/*.svg
.pre-commit-config.yaml · 34 lines · new file
@@ -0,0 +1,34 @@
fail_fast: true
repos:
  - repo: local
    hooks:
      - id: system
        name: Black
        entry: poetry run black docling examples
        pass_filenames: false
        language: system
        files: '\.py$'
  - repo: local
    hooks:
      - id: system
        name: isort
        entry: poetry run isort docling examples
        pass_filenames: false
        language: system
        files: '\.py$'
  # - repo: local
  #   hooks:
  #     - id: system
  #       name: flake8
  #       entry: poetry run flake8 docling
  #       pass_filenames: false
  #       language: system
  #       files: '\.py$'
  # - repo: local
  #   hooks:
  #     - id: system
  #       name: MyPy
  #       entry: poetry run mypy docling
  #       pass_filenames: false
  #       language: system
  #       files: '\.py$'
CODE_OF_CONDUCT.md · 129 lines · new file
@@ -0,0 +1,129 @@
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement using
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).

All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
CONTRIBUTING.md · 184 lines · new file
@@ -0,0 +1,184 @@
## Contributing In General

Our project welcomes external contributions. If you have an itch, please feel
free to scratch it.

To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).

A good way to familiarize yourself with the codebase and contribution process is
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.

For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).

**Note: We appreciate your effort and want to avoid a situation where a contribution
requires extensive rework (by you or by us), sits in the backlog for a long time, or
cannot be accepted at all!**

### Proposing new features

If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
before sending a pull request so the feature can be discussed. This avoids
wasting your valuable time on a feature that the project developers
are not interested in accepting into the code base.

### Fixing bugs

If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
pull request so it can be tracked.

### Merge approval

The project maintainers use LGTM (Looks Good To Me) in comments on the code
review to indicate acceptance. A change requires LGTMs from two of the
maintainers of each component affected.

For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.

## Legal

Each source file must include a license header for the MIT license. Using the
SPDX format is the simplest approach, e.g.

```
/*
Copyright IBM Inc. All rights reserved.

SPDX-License-Identifier: MIT
*/
```

We have tried to make it as easy as possible to make contributions. This
applies to how we handle the legal aspects of contribution. We use the
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
uses to manage code contributions.

We simply ask that when submitting a patch for review, the developer
include a sign-off statement in the commit message.

Here is an example Signed-off-by line, which indicates that the
submitter accepts the DCO:

```
Signed-off-by: John Doe <john.doe@example.com>
```

You can include this automatically when you commit a change to your
local git repository using the following command:

```
git commit -s
```

## Communication

Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).

## Developing

### Usage of Poetry

We use Poetry to manage dependencies.

#### Install

To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer

1. Install Poetry globally on your machine
   ```bash
   curl -sSL https://install.python-poetry.org | python3 -
   ```
   The installation script will print the installation bin folder `POETRY_BIN`, which you will need in the next steps.

2. Make sure Poetry is in your `$PATH`
   - for `zsh`
     ```sh
     echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
     ```
   - for `bash`
     ```sh
     echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
     ```

3. The official guidelines linked above include useful details on configuring autocomplete for most shell environments, e.g. Bash and Zsh.

#### Create a Virtual Environment and Install Dependencies

To activate the virtual environment, run:

```bash
poetry shell
```

This spawns a shell with the virtual environment activated. If the virtual environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:

```bash
poetry install
```

**(Advanced) Use a Specific Python Version**

If for whatever reason you need to work with a specific (older) version of Python, run:

```bash
poetry env use $(which python3.8)
```

This creates a virtual environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.

#### Add a new dependency

```bash
poetry add NAME
```

## Coding style guidelines

We use the following tools to enforce code style:

- isort, to sort imports
- Black, to format code

We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:

```bash
pre-commit install
```

To run the checks on demand, run:

```
pre-commit run --all-files
```

Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.

## Documentation

We use [MkDocs](https://www.mkdocs.org/) to write documentation.

To run the documentation server, do:

```bash
mkdocs serve
```

The server will be available at [http://localhost:8000](http://localhost:8000).

### Pushing Documentation to GitHub Pages

Run the following:

```bash
mkdocs gh-deploy
```
Dockerfile · 23 lines · new file
@@ -0,0 +1,23 @@
FROM python:3.11-slim-bookworm

ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

RUN apt-get update \
    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
    && apt-get clean

RUN --mount=type=ssh \
    pip install --no-cache-dir git+https://github.com/DS4SD/docling.git

ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/

COPY examples/minimal.py /root/minimal.py

RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf

# On container shell:
# > cd /root/
# > python minimal.py
LICENSE · 21 lines · new file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
MAINTAINERS.md · 10 lines · new file
@@ -0,0 +1,10 @@
# MAINTAINERS

- Christoph Auer - [@cau-git](https://github.com/cau-git)
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
- Ahmed Nassar - [@nassarofficial](https://github.com/nassarofficial)
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)

Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
README.md · 99 lines · new file
@@ -0,0 +1,99 @@
<p align="center">
  <a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
</p>

# Docling

Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.

## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Optionally applies OCR (use with scanned PDFs)

## Setup

You need Python 3.11 and Poetry. Install Poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).

Once you have `poetry` installed, create an environment and install the package:

```bash
poetry env use $(which python3.11)
poetry shell
poetry install
```

**Notes**:
* Works on macOS and Linux environments. Windows platforms are currently not tested.

## Usage

For basic usage, see the [convert.py](examples/convert.py) example module. Run with:

```
python examples/convert.py
```

The output of the above command will be written to `./scratch`.

### Enable or disable pipeline features

You can control whether table structure recognition or OCR should be performed by passing arguments to `DocumentConverter`:

```python
doc_converter = DocumentConverter(
    artifacts_path=artifacts_path,
    pipeline_options=PipelineOptions(
        do_table_structure=False,  # Controls if table structure is recovered.
        do_ocr=True,  # Controls if OCR is applied (ignores programmatic content)
    ),
)
```

### Impose limits on the document size

You can limit the file size and the number of pages allowed to be processed per document:

```python
paths = [Path("./test/data/2206.01062.pdf")]

input = DocumentConversionInput.from_paths(
    paths, limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
)
```

### Convert from binary PDF streams

You can convert PDFs from a binary stream instead of from the filesystem as follows:

```python
buf = BytesIO(your_binary_stream)
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
input = DocumentConversionInput.from_streams(docs)
converted_docs = doc_converter.convert(input)
```

### Limit resource usage

You can limit the CPU threads used by `docling` by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
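As a minimal illustration (an editor's sketch, not part of the committed examples), the same cap can also be applied from Python by setting the variable before `docling` and its native dependencies are imported; exporting `OMP_NUM_THREADS` in the shell before launching the script works equally well:

```python
import os

# Cap the OpenMP thread count before importing docling, so the native
# libraries pick the value up when they initialize.
os.environ["OMP_NUM_THREADS"] = "2"

from docling.document_converter import DocumentConverter  # noqa: E402
```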
## Contributing

Please read [Contributing to Docling](./CONTRIBUTING.md) for details.

## References

If you use `Docling` in your projects, please consider citing the following:

```bib
@software{Docling,
  author = {Deep Search Team},
  month = {7},
  title = {{Docling}},
  url = {https://github.com/DS4SD/docling},
  version = {main},
  year = {2024}
}
```

## License

The `Docling` codebase is under MIT license.
For individual model usage, please refer to the model licenses found in the original packages.
docling/__init__.py · 0 lines · new file
docling/backend/__init__.py · 0 lines · new file
docling/backend/abstract_backend.py · 55 lines · new file
@@ -0,0 +1,55 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional, Union

from PIL import Image


class PdfPageBackend(ABC):
    def __init__(self, page_obj: Any) -> object:
        pass

    @abstractmethod
    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
        pass

    @abstractmethod
    def get_text_cells(self) -> Iterable["Cell"]:
        pass

    @abstractmethod
    def get_page_image(
        self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
    ) -> Image.Image:
        pass

    @abstractmethod
    def get_size(self) -> "PageSize":
        pass

    @abstractmethod
    def unload(self):
        pass


class PdfDocumentBackend(ABC):
    @abstractmethod
    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
        pass

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        pass

    @abstractmethod
    def page_count(self) -> int:
        pass

    @abstractmethod
    def is_valid(self) -> bool:
        pass

    @abstractmethod
    def unload(self):
        pass
docling/backend/pypdfium2_backend.py · 223 lines · new file
@@ -0,0 +1,223 @@
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union

import pypdfium2 as pdfium
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize


class PyPdfiumPageBackend(PdfPageBackend):
    def __init__(self, page_obj: PdfPage):
        super().__init__(page_obj)
        self._ppage = page_obj
        self.text_page = None

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.text_page:
            self.text_page = self._ppage.get_textpage()

        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
            bbox = bbox.to_bottom_left_origin(self.get_size().height)

        text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        if not self.text_page:
            self.text_page = self._ppage.get_textpage()

        cells = []
        cell_counter = 0

        page_size = self.get_size()

        for i in range(self.text_page.count_rects()):
            rect = self.text_page.get_rect(i)
            text_piece = self.text_page.get_text_bounded(*rect)
            x0, y0, x1, y1 = rect
            cells.append(
                Cell(
                    id=cell_counter,
                    text=text_piece,
                    bbox=BoundingBox(
                        l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1

        # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
        # The cell merging code below is to clean this up.
        def merge_horizontal_cells(
            cells: List[Cell],
            horizontal_threshold_factor: float = 1.0,
            vertical_threshold_factor: float = 0.5,
        ) -> List[Cell]:
            if not cells:
                return []

            def group_rows(cells: List[Cell]) -> List[List[Cell]]:
                rows = []
                current_row = [cells[0]]
                row_top = cells[0].bbox.t
                row_bottom = cells[0].bbox.b
                row_height = cells[0].bbox.height

                for cell in cells[1:]:
                    vertical_threshold = row_height * vertical_threshold_factor
                    if (
                        abs(cell.bbox.t - row_top) <= vertical_threshold
                        and abs(cell.bbox.b - row_bottom) <= vertical_threshold
                    ):
                        current_row.append(cell)
                        row_top = min(row_top, cell.bbox.t)
                        row_bottom = max(row_bottom, cell.bbox.b)
                        row_height = row_bottom - row_top
                    else:
                        rows.append(current_row)
                        current_row = [cell]
                        row_top = cell.bbox.t
                        row_bottom = cell.bbox.b
                        row_height = cell.bbox.height

                if current_row:
                    rows.append(current_row)

                return rows

            def merge_row(row: List[Cell]) -> List[Cell]:
                merged = []
                current_group = [row[0]]

                for cell in row[1:]:
                    prev_cell = current_group[-1]
                    avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
                    if (
                        cell.bbox.l - prev_cell.bbox.r
                        <= avg_height * horizontal_threshold_factor
                    ):
                        current_group.append(cell)
                    else:
                        merged.append(merge_group(current_group))
                        current_group = [cell]

                if current_group:
                    merged.append(merge_group(current_group))

                return merged

            def merge_group(group: List[Cell]) -> Cell:
                if len(group) == 1:
                    return group[0]

                merged_text = "".join(cell.text for cell in group)
                merged_bbox = BoundingBox(
                    l=min(cell.bbox.l for cell in group),
                    t=min(cell.bbox.t for cell in group),
                    r=max(cell.bbox.r for cell in group),
                    b=max(cell.bbox.b for cell in group),
                )
                return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)

            rows = group_rows(cells)
            merged_cells = [cell for row in rows for cell in merge_row(row)]

            for i, cell in enumerate(merged_cells, 1):
                cell.id = i

            return merged_cells

        def draw_clusters_and_cells():
            image = self.get_page_image()
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # before merge:
        # draw_clusters_and_cells()

        cells = merge_horizontal_cells(cells)

        # after merge:
        # draw_clusters_and_cells()

        return cells

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:

        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
        self.text_page = None


class PyPdfiumDocumentBackend(PdfDocumentBackend):
    def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
        super().__init__(path_or_stream)

        if isinstance(path_or_stream, Path):
            self._pdoc = pdfium.PdfDocument(path_or_stream)
        elif isinstance(path_or_stream, BytesIO):
            self._pdoc = pdfium.PdfDocument(
                path_or_stream
            )  # TODO Fix me, won't accept bytes.

    def page_count(self) -> int:
        return len(self._pdoc)

    def load_page(self, page_no: int) -> PdfPage:
        return PyPdfiumPageBackend(self._pdoc[page_no])

    def is_valid(self) -> bool:
        return self.page_count() > 0

    def unload(self):
        self._pdoc.close()
        self._pdoc = None
docling/datamodel/__init__.py · 0 lines · new file
docling/datamodel/base_models.py · 247 lines · new file
@@ -0,0 +1,247 @@
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator

from docling.backend.abstract_backend import PdfPageBackend


class ConversionStatus(str, Enum):
    PENDING = auto()
    STARTED = auto()
    FAILURE = auto()
    SUCCESS = auto()
    SUCCESS_WITH_ERRORS = auto()


class DocInputType(str, Enum):
    PATH = auto()
    STREAM = auto()


class CoordOrigin(str, Enum):
    TOPLEFT = auto()
    BOTTOMLEFT = auto()


class PageSize(BaseModel):
    width: float = 0.0
    height: float = 0.0


class BoundingBox(BaseModel):
    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        return self.r - self.l

    @property
    def height(self):
        return abs(self.t - self.b)

    def as_tuple(self):
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
        if origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
            )
        elif origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
            )

    def area(self) -> float:
        return (self.r - self.l) * (self.b - self.t)

    def intersection_area_with(self, other: "BoundingBox") -> float:
        # Calculate intersection coordinates
        left = max(self.l, other.l)
        top = max(self.t, other.t)
        right = min(self.r, other.r)
        bottom = min(self.b, other.b)

        # Calculate intersection dimensions
        width = right - left
        height = bottom - top

        # If the bounding boxes do not overlap, width or height will be negative
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height):
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,  # self.b
                b=page_height - self.b,  # self.t
                coord_origin=CoordOrigin.TOPLEFT,
            )


class Cell(BaseModel):
    id: int
    text: str
    bbox: BoundingBox


class OcrCell(Cell):
    confidence: float


class Cluster(BaseModel):
    id: int
    label: str
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []


class BasePageElement(BaseModel):
    label: str
    id: int
    page_no: int
    cluster: Cluster
    text: Optional[str] = None


class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


class TableCell(BaseModel):
    bbox: BoundingBox
    row_span: int
    col_span: int
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        if isinstance(data, Dict):
            text = data["bbox"].get("token", "")
            if not len(text):
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    for el in text_cells:
                        text += el["token"] + " "

                text = text.strip()
            data["text"] = text

        return data


class TableElement(BasePageElement):
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]


class TableStructurePrediction(BaseModel):
    table_map: Dict[int, TableElement] = {}


class TextElement(BasePageElement):
    ...


class FigureData(BaseModel):
    pass


class FigureElement(BasePageElement):
    data: Optional[FigureData] = None
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None


class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}


class EquationPrediction(BaseModel):
    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}


class PagePredictions(BaseModel):
    layout: LayoutPrediction = None
    tablestructure: TableStructurePrediction = None
    figures_classification: FigureClassificationPrediction = None
    equations_prediction: EquationPrediction = None


PageElement = Union[TextElement, TableElement, FigureElement]


class AssembledUnit(BaseModel):
    elements: List[PageElement]
    body: List[PageElement]
    headers: List[PageElement]


class Page(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    page_hash: str = None
    size: PageSize = None
    image: Image = None
    cells: List[Cell] = None
    predictions: PagePredictions = PagePredictions()
    assembled: AssembledUnit = None

    _backend: PdfPageBackend = None  # Internal PDF backend


class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    filename: str
    stream: BytesIO


class PipelineOptions(BaseModel):
    do_table_structure: bool = True
    do_ocr: bool = False
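The two coordinate origins above are easy to mix up, so here is a small usage sketch (an editor's illustration, not part of the commit) of converting a PDF-style bottom-left box into the top-left convention used by the layout code; the page height of 792 points is an assumed US-Letter value:

```python
from docling.datamodel.base_models import BoundingBox, CoordOrigin

page_height = 792.0  # assumed US-Letter page height in points

# A box expressed in PDF coordinates (origin at the bottom-left corner).
pdf_box = BoundingBox.from_tuple(
    (72.0, 700.0, 300.0, 720.0), origin=CoordOrigin.BOTTOMLEFT
)

# Convert to the top-left convention; width and height stay the same.
tl_box = pdf_box.to_top_left_origin(page_height)
print(tl_box.as_tuple())  # (72.0, 72.0, 300.0, 92.0)
```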
351
docling/datamodel/document.py
Normal file
351
docling/datamodel/document.py
Normal file
@ -0,0 +1,351 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
||||
|
||||
from deepsearch.documents.core.export import export_to_markdown
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
from docling_core.types import BoundingBox as DsBoundingBox
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
FigureElement,
|
||||
Page,
|
||||
TableElement,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.utils import create_file_hash
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
layout_label_to_ds_type = {
|
||||
"Title": "title",
|
||||
"Document Index": "table-of-path_or_stream",
|
||||
"Section-header": "subtitle-level-1",
|
||||
"Checkbox-Selected": "checkbox-selected",
|
||||
"Checkbox-Unselected": "checkbox-unselected",
|
||||
"Caption": "caption",
|
||||
"Page-header": "page-header",
|
||||
"Page-footer": "page-footer",
|
||||
"Footnote": "footnote",
|
||||
"Table": "table",
|
||||
"Formula": "equation",
|
||||
"List-item": "paragraph",
|
||||
"Code": "paragraph",
|
||||
"Picture": "figure",
|
||||
"Text": "paragraph",
|
||||
}
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
document_hash: Optional[str] = None
|
||||
valid: bool = False
|
||||
limits: DocumentLimits = DocumentLimits()
|
||||
|
||||
filesize: Optional[int] = None
|
||||
page_count: Optional[int] = None
|
||||
|
||||
_backend: PdfDocumentBackend = None # Internal PDF backend used
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
pdf_backend=PyPdfiumDocumentBackend,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
self.file = path_or_stream
|
||||
self.filesize = path_or_stream.stat().st_size
|
||||
if self.filesize > self.limits.max_file_size:
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(path_or_stream=path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
|
||||
if self.filesize > self.limits.max_file_size:
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = pdf_backend(path_or_stream=path_or_stream)
|
||||
|
||||
if self.document_hash and self._backend.page_count() > 0:
|
||||
self.page_count = self._backend.page_count()
|
||||
|
||||
if self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = True
|
||||
|
||||
except (FileNotFoundError, OSError) as e:
|
||||
_log.exception(
|
||||
f"File {self.file.name} not found or cannot be opened.", exc_info=e
|
||||
)
|
||||
# raise
|
||||
except RuntimeError as e:
|
||||
_log.exception(
|
||||
f"An unexpected error occurred while opening the document {self.file.name}",
|
||||
exc_info=e,
|
||||
)
|
||||
# raise
|
||||
|
||||
|
||||
class ConvertedDocument(BaseModel):
|
||||
input: InputDocument
|
||||
|
||||
status: ConversionStatus = ConversionStatus.PENDING # failure, success
|
||||
errors: List[Dict] = [] # structure to keep errors
|
||||
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = None
|
||||
|
||||
output: DsDocument = None
|
||||
|
||||
def to_ds_document(self) -> DsDocument:
|
||||
title = ""
|
||||
desc = DsDocumentDescription(logs=[])
|
||||
|
||||
page_hashes = [
|
||||
PageReference(hash=p.page_hash, page=p.page_no, model="default")
|
||||
for p in self.pages
|
||||
]
|
||||
|
||||
file_info = DsFileInfoObject(
|
||||
filename=self.input.file.name,
|
||||
document_hash=self.input.document_hash,
|
||||
num_pages=self.input.page_count,
|
||||
page_hashes=page_hashes,
|
||||
)
|
||||
|
||||
main_text = []
|
||||
tables = []
|
||||
figures = []
|
||||
|
||||
page_no_to_page = {p.page_no: p for p in self.pages}
|
||||
|
||||
for element in self.assembled.elements:
|
||||
# Convert bboxes to lower-left origin.
|
||||
target_bbox = DsBoundingBox(
|
||||
element.cluster.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple()
|
||||
)
|
||||
|
||||
if isinstance(element, TextElement):
|
||||
main_text.append(
|
||||
BaseText(
|
||||
text=element.text,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
name=element.label,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no,
|
||||
span=[0, len(element.text)],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
elif isinstance(element, TableElement):
|
||||
index = len(tables)
|
||||
ref_str = f"#/tables/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
|
||||
# Initialise empty table data grid (only empty cells)
|
||||
table_data = [
|
||||
[
|
||||
TableCell(
|
||||
text="",
|
||||
# bbox=[0,0,0,0],
|
||||
spans=[[i, j]],
|
||||
obj_type="body",
|
||||
)
|
||||
for j in range(element.num_cols)
|
||||
]
|
||||
for i in range(element.num_rows)
|
||||
]
|
||||
|
||||
# Overwrite cells in table data for which there is actual cell content.
|
||||
for cell in element.table_cells:
|
||||
for i in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for j in range(
|
||||
min(cell.start_col_offset_idx, element.num_cols),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
celltype = "body"
|
||||
if cell.column_header:
|
||||
celltype = "col_header"
|
||||
elif cell.row_header:
|
||||
celltype = "row_header"
|
||||
|
||||
def make_spans(cell):
|
||||
for rspan in range(
|
||||
min(cell.start_row_offset_idx, element.num_rows),
|
||||
min(cell.end_row_offset_idx, element.num_rows),
|
||||
):
|
||||
for cspan in range(
|
||||
min(
|
||||
cell.start_col_offset_idx, element.num_cols
|
||||
),
|
||||
min(cell.end_col_offset_idx, element.num_cols),
|
||||
):
|
||||
yield [rspan, cspan]
|
||||
|
||||
spans = list(make_spans(cell))
|
||||
table_data[i][j] = TableCell(
|
||||
text=cell.text,
|
||||
bbox=cell.bbox.to_bottom_left_origin(
|
||||
page_no_to_page[element.page_no].size.height
|
||||
).as_tuple(),
|
||||
# col=j,
|
||||
# row=i,
|
||||
spans=spans,
|
||||
obj_type=celltype,
|
||||
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
||||
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
||||
)
|
||||
|
||||
tables.append(
|
||||
DsSchemaTable(
|
||||
num_cols=element.num_cols,
|
||||
num_rows=element.num_rows,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
data=table_data,
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
elif isinstance(element, FigureElement):
|
||||
index = len(figures)
|
||||
ref_str = f"#/figures/{index}"
|
||||
main_text.append(
|
||||
Ref(
|
||||
name=element.label,
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
ref=ref_str,
|
||||
),
|
||||
)
|
||||
figures.append(
|
||||
BaseCell(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
page=element.page_no,
|
||||
span=[0, 0],
|
||||
)
|
||||
],
|
||||
obj_type=layout_label_to_ds_type.get(element.label),
|
||||
# data=[[]],
|
||||
)
|
||||
)
|
||||
|
||||
page_dimensions = [
|
||||
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
||||
for p in self.pages
|
||||
]
|
||||
|
||||
ds_doc = DsDocument(
|
||||
name=title,
|
||||
description=desc,
|
||||
file_info=file_info,
|
||||
main_text=main_text,
|
||||
tables=tables,
|
||||
figures=figures,
|
||||
page_dimensions=page_dimensions,
|
||||
)
|
||||
|
||||
return ds_doc
|
||||
|
||||
def render_as_dict(self):
|
||||
if self.output:
|
||||
return self.output.model_dump(by_alias=True, exclude_none=True)
|
||||
else:
|
||||
return {}
|
||||
|
||||
def render_as_markdown(self):
|
||||
if self.output:
|
||||
return export_to_markdown(
|
||||
self.output.model_dump(by_alias=True, exclude_none=True)
|
||||
)
|
||||
else:
|
||||
return ""
|
||||
|
||||
|
||||
class DocumentConversionInput(BaseModel):
|
||||
|
||||
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
||||
DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
|
||||
|
||||
def docs(
|
||||
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
||||
) -> Iterable[InputDocument]:
|
||||
|
||||
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
||||
|
||||
for obj in self._path_or_stream_iterator:
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
filename=obj.filename,
|
||||
limits=self.limits,
|
||||
pdf_backend=pdf_backend,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||
paths = [Path(p) for p in paths]
|
||||
|
||||
doc_input = cls(limits=limits)
|
||||
doc_input._path_or_stream_iterator = paths
|
||||
|
||||
return doc_input
|
||||
|
||||
@classmethod
|
||||
def from_streams(
|
||||
cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
|
||||
):
|
||||
doc_input = cls(limits=limits)
|
||||
doc_input._path_or_stream_iterator = streams
|
||||
|
||||
return doc_input
|
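A minimal usage sketch for the input wrapper above; the file name is hypothetical and the imports follow the module paths introduced in this commit:

from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.settings import DocumentLimits

# "report.pdf" is a placeholder; limits are optional.
doc_input = DocumentConversionInput.from_paths(
    [Path("report.pdf")], limits=DocumentLimits(max_num_pages=50)
)
for in_doc in doc_input.docs():  # yields InputDocument objects, one per path
    ...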
32
docling/datamodel/settings.py
Normal file
@@ -0,0 +1,32 @@
import sys

from pydantic import BaseModel
from pydantic_settings import BaseSettings


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize


class BatchConcurrencySettings(BaseModel):
    doc_batch_size: int = 2
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1

    # model_concurrency: int = 2

    # To force models into single core: export OMP_NUM_THREADS=1


class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings


settings = AppSettings(perf=BatchConcurrencySettings())
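A sketch of how the batching knobs above could be tuned at runtime; this is an assumed usage pattern, not an API documented in this commit:

from docling.datamodel.settings import BatchConcurrencySettings, settings

# Larger batches for long documents; defaults are doc_batch_size=2, page_batch_size=4.
settings.perf = BatchConcurrencySettings(doc_batch_size=4, page_batch_size=8)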
207
docling/document_converter.py
Normal file
@@ -0,0 +1,207 @@
|
||||
import functools
|
||||
import logging
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConvertedDocument,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
|
||||
from docling.utils.utils import chunkify, create_hash
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
artifacts_path: Optional[Union[Path, str]] = None,
|
||||
pipeline_options: PipelineOptions = PipelineOptions(),
|
||||
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
||||
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
||||
):
|
||||
if not artifacts_path:
|
||||
artifacts_path = self.download_models_hf()
|
||||
|
||||
artifacts_path = Path(artifacts_path)
|
||||
|
||||
self.model_pipeline = pipeline_cls(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
)
|
||||
|
||||
self.page_assemble_model = PageAssembleModel(config={})
|
||||
self.glm_model = GlmModel(config={})
|
||||
self.pdf_backend = pdf_backend
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
|
||||
|
||||
for input_batch in chunkify(
|
||||
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
||||
):
|
||||
_log.info(f"Going to convert document batch...")
|
||||
# parallel processing only within input_batch
|
||||
# with ThreadPoolExecutor(
|
||||
# max_workers=settings.perf.doc_batch_concurrency
|
||||
# ) as pool:
|
||||
# yield from pool.map(self.process_document, input_batch)
|
||||
|
||||
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
||||
yield from map(self.process_document, input_batch)
|
||||
|
||||
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
||||
start_doc_time = time.time()
|
||||
converted_doc = ConvertedDocument(input=in_doc)
|
||||
|
||||
if not in_doc.valid:
|
||||
converted_doc.status = ConversionStatus.FAILURE
|
||||
return converted_doc
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
converted_doc.pages.append(Page(page_no=i))
|
||||
|
||||
all_assembled_pages = []
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(
|
||||
converted_doc.pages, settings.perf.page_batch_size
|
||||
):
|
||||
|
||||
start_pb_time = time.time()
|
||||
# Pipeline
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self.initialize_page, in_doc), page_batch
|
||||
)
|
||||
|
||||
# 2. Populate page image
|
||||
pages_with_images = map(
|
||||
functools.partial(self.populate_page_images, in_doc), init_pages
|
||||
)
|
||||
|
||||
# 3. Populate programmatic page cells
|
||||
pages_with_cells = map(
|
||||
functools.partial(self.parse_page_cells, in_doc),
|
||||
pages_with_images,
|
||||
)
|
||||
|
||||
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
||||
|
||||
# 7. Assemble page elements (per page)
|
||||
assembled_pages = self.page_assemble_model(pipeline_pages)
|
||||
|
||||
# exhaust assembled_pages
|
||||
for assembled_page in assembled_pages:
|
||||
# Free up mem resources before moving on with next batch
|
||||
assembled_page.image = (
|
||||
None # Comment this if you want to visualize page images
|
||||
)
|
||||
assembled_page._backend.unload()
|
||||
|
||||
all_assembled_pages.append(assembled_page)
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
# Free up mem resources of PDF backend
|
||||
in_doc._backend.unload()
|
||||
|
||||
converted_doc.pages = all_assembled_pages
|
||||
self.assemble_doc(converted_doc)
|
||||
|
||||
converted_doc.status = ConversionStatus.SUCCESS
|
||||
|
||||
except Exception as e:
|
||||
converted_doc.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.info(f"Encountered an error during conversion: {trace}")
|
||||
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(
|
||||
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
||||
)
|
||||
|
||||
return converted_doc
|
||||
|
||||
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
||||
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
page._backend = doc._backend.load_page(page.page_no)
|
||||
page.size = page._backend.get_size()
|
||||
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
||||
|
||||
return page
|
||||
|
||||
# Generate the page image and store it in the page object
|
||||
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
||||
page.image = page._backend.get_page_image()
|
||||
|
||||
return page
|
||||
|
||||
# Extract and populate the page cells and store it in the page object
|
||||
def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
||||
page.cells = page._backend.get_text_cells()
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells):
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
image.show()
|
||||
|
||||
# draw_text_boxes(page.image, cells)
|
||||
|
||||
return page
|
||||
|
||||
def assemble_doc(self, converted_doc: ConvertedDocument):
|
||||
all_elements = []
|
||||
all_headers = []
|
||||
all_body = []
|
||||
|
||||
for p in converted_doc.pages:
|
||||
|
||||
for el in p.assembled.body:
|
||||
all_body.append(el)
|
||||
for el in p.assembled.headers:
|
||||
all_headers.append(el)
|
||||
for el in p.assembled.elements:
|
||||
all_elements.append(el)
|
||||
|
||||
converted_doc.assembled = AssembledUnit(
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
converted_doc.output = self.glm_model(converted_doc)
|
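An end-to-end sketch of the converter above; the input file is hypothetical, and omitting artifacts_path triggers the Hugging Face model download in download_models_hf():

from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

converter = DocumentConverter()  # downloads model artifacts on first use
doc_input = DocumentConversionInput.from_paths([Path("report.pdf")])

for converted in converter.convert(doc_input):
    print(converted.status)
    print(converted.render_as_markdown()[:300])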
0
docling/models/__init__.py
Normal file
82
docling/models/ds_glm_model.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import copy
|
||||
import random
|
||||
|
||||
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
|
||||
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import Ref
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
||||
from docling.datamodel.document import ConvertedDocument
|
||||
|
||||
|
||||
class GlmModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
load_pretrained_nlp_models()
|
||||
model = init_nlp_model(model_names="language;term;reference")
|
||||
self.model = model
|
||||
|
||||
def __call__(self, document: ConvertedDocument) -> DsDocument:
|
||||
ds_doc = document.to_ds_document()
|
||||
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||
|
||||
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||
ds_doc_dict = to_legacy_document_format(
|
||||
glm_doc, ds_doc_dict, update_name_label=True
|
||||
)
|
||||
|
||||
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells(ds_document, page_no):
|
||||
clusters_to_draw = []
|
||||
image = copy.deepcopy(document.pages[page_no].image)
|
||||
for ix, elem in enumerate(ds_document.main_text):
|
||||
if isinstance(elem, BaseText):
|
||||
prov = elem.prov[0]
|
||||
elif isinstance(elem, Ref):
|
||||
_, arr, index = elem.ref.split("/")
|
||||
index = int(index)
|
||||
if arr == "tables":
|
||||
prov = ds_document.tables[index].prov[0]
|
||||
elif arr == "figures":
|
||||
prov = ds_document.figures[index].prov[0]
|
||||
else:
|
||||
prov = None
|
||||
|
||||
if prov and prov.page == page_no:
|
||||
clusters_to_draw.append(
|
||||
Cluster(
|
||||
id=ix,
|
||||
label=elem.name,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=prov.bbox,
|
||||
origin=CoordOrigin.BOTTOMLEFT,
|
||||
).to_top_left_origin(document.pages[page_no].size.height),
|
||||
)
|
||||
)
|
||||
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in clusters_to_draw:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
|
||||
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
for tc in c.cells: # [:1]:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return exported_doc
|
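The exported document refers to tables and figures through JSON-pointer-like strings ("#/tables/<i>", "#/figures/<i>"), which is what the debug helper above splits apart. A standalone check of that format:

ref_str = "#/figures/2"  # same shape as the Ref objects built in to_ds_document()
_, arr, index = ref_str.split("/")
assert (arr, int(index)) == ("figures", 2)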
77
docling/models/easyocr_model.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import copy
|
||||
import logging
|
||||
import random
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.enabled = config["enabled"]
|
||||
self.scale = 3  # render at 3x of 72 dpi, i.e. 216 dpi
|
||||
|
||||
if self.enabled:
|
||||
import easyocr
|
||||
|
||||
self.reader = easyocr.Reader(config["lang"])
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
# rects = page._fpage.
|
||||
high_res_image = page._backend.get_page_image(scale=self.scale)
|
||||
im = numpy.array(high_res_image)
|
||||
result = self.reader.readtext(im)
|
||||
|
||||
del high_res_image
|
||||
del im
|
||||
|
||||
cells = [
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=line[1],
|
||||
confidence=line[2],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(
|
||||
line[0][0][0] / self.scale,
|
||||
line[0][0][1] / self.scale,
|
||||
line[0][2][0] / self.scale,
|
||||
line[0][2][1] / self.scale,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
]
|
||||
|
||||
page.cells = cells # For now, just overwrites all digital cells.
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells():
|
||||
image = copy.deepcopy(page.image)
|
||||
draw = ImageDraw.Draw(image)
|
||||
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
for tc in cells:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
yield page
|
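EasyOCR runs on a 3x-scaled page image, so the detected quadrilaterals are divided back by self.scale before becoming OcrCell bounding boxes. A standalone check of that rescaling with made-up coordinates:

scale = 3
# easyocr returns (quad, text, confidence); the quad corners are [tl, tr, br, bl].
line = ([[30.0, 60.0], [300.0, 60.0], [300.0, 120.0], [30.0, 120.0]], "Hello", 0.98)
l, t = line[0][0][0] / scale, line[0][0][1] / scale
r, b = line[0][2][0] / scale, line[0][2][1] / scale
assert (l, t, r, b) == (10.0, 20.0, 100.0, 40.0)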
318
docling/models/layout_model.py
Normal file
@@ -0,0 +1,318 @@
|
||||
import copy
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from typing import Iterable, List
|
||||
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Cell,
|
||||
Cluster,
|
||||
CoordOrigin,
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.utils import layout_utils as lu
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LayoutModel:
|
||||
|
||||
TEXT_ELEM_LABELS = [
|
||||
"Text",
|
||||
"Footnote",
|
||||
"Caption",
|
||||
"Checkbox-Unselected",
|
||||
"Checkbox-Selected",
|
||||
"Section-header",
|
||||
"Page-header",
|
||||
"Page-footer",
|
||||
"Code",
|
||||
"List-item",
|
||||
# "Formula",
|
||||
]
|
||||
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
||||
|
||||
TABLE_LABEL = "Table"
|
||||
FIGURE_LABEL = "Picture"
|
||||
FORMULA_LABEL = "Formula"
|
||||
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.layout_predictor = LayoutPredictor(
|
||||
config["artifacts_path"]
|
||||
) # TODO temporary
|
||||
|
||||
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||
MIN_INTERSECTION = 0.2
|
||||
CLASS_THRESHOLDS = {
|
||||
"Caption": 0.35,
|
||||
"Footnote": 0.35,
|
||||
"Formula": 0.35,
|
||||
"List-item": 0.35,
|
||||
"Page-footer": 0.35,
|
||||
"Page-header": 0.35,
|
||||
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||
"Section-header": 0.45,
|
||||
"Table": 0.35,
|
||||
"Text": 0.45,
|
||||
"Title": 0.45,
|
||||
"Document Index": 0.45,
|
||||
"Code": 0.45,
|
||||
"Checkbox-Selected": 0.45,
|
||||
"Checkbox-Unselected": 0.45,
|
||||
"Form": 0.45,
|
||||
"Key-Value Region": 0.45,
|
||||
}
|
||||
|
||||
_log.debug("================= Start postprocess function ====================")
|
||||
start_time = time.time()
|
||||
# Apply Confidence Threshold to cluster predictions
|
||||
# confidence = self.conf_threshold
|
||||
clusters_out = []
|
||||
|
||||
for cluster in clusters:
|
||||
confidence = CLASS_THRESHOLDS[cluster.label]
|
||||
if cluster.confidence >= confidence:
|
||||
# annotation["created_by"] = "high_conf_pred"
|
||||
clusters_out.append(cluster)
|
||||
|
||||
# map to dictionary clusters and cells, with bottom left origin
|
||||
clusters = [
|
||||
{
|
||||
"id": c.id,
|
||||
"bbox": list(
|
||||
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||
), # TODO
|
||||
"confidence": c.confidence,
|
||||
"cell_ids": [],
|
||||
"type": c.label,
|
||||
}
|
||||
for c in clusters
|
||||
]
|
||||
|
||||
clusters_out = [
|
||||
{
|
||||
"id": c.id,
|
||||
"bbox": list(
|
||||
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||
), # TODO
|
||||
"confidence": c.confidence,
|
||||
"created_by": "high_conf_pred",
|
||||
"cell_ids": [],
|
||||
"type": c.label,
|
||||
}
|
||||
for c in clusters_out
|
||||
]
|
||||
|
||||
raw_cells = [
|
||||
{
|
||||
"id": c.id,
|
||||
"bbox": list(
|
||||
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||
), # TODO
|
||||
"text": c.text,
|
||||
}
|
||||
for c in cells
|
||||
]
|
||||
cell_count = len(raw_cells)
|
||||
|
||||
_log.debug("---- 0. Treat cluster overlaps ------")
|
||||
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
|
||||
|
||||
_log.debug(
|
||||
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
|
||||
)
|
||||
## Check for cells included in or touched by clusters:
|
||||
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||
clusters_out, raw_cells, MIN_INTERSECTION
|
||||
)
|
||||
|
||||
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
|
||||
# Creates a map of cell_id->cluster_id
|
||||
(
|
||||
clusters_around_cells,
|
||||
orphan_cell_indices,
|
||||
ambiguous_cell_indices,
|
||||
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||
|
||||
# Assign orphan cells with lower confidence predictions
|
||||
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
||||
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||
)
|
||||
|
||||
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
||||
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||
clusters_out, raw_cells, MIN_INTERSECTION
|
||||
)
|
||||
|
||||
_log.debug("---- 3. Settle Ambigous Cells")
|
||||
# Creates an update map after assignment of cell_id->cluster_id
|
||||
(
|
||||
clusters_around_cells,
|
||||
orphan_cell_indices,
|
||||
ambiguous_cell_indices,
|
||||
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||
|
||||
# Settle pdf cells that belong to multiple clusters
|
||||
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
|
||||
clusters_out, raw_cells, ambiguous_cell_indices
|
||||
)
|
||||
|
||||
_log.debug("---- 4. Set Orphans as Text")
|
||||
(
|
||||
clusters_around_cells,
|
||||
orphan_cell_indices,
|
||||
ambiguous_cell_indices,
|
||||
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||
|
||||
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
||||
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||
)
|
||||
|
||||
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
||||
# Merge orphan cells
|
||||
clusters_out = lu.merge_cells(clusters_out)
|
||||
|
||||
# Clean up clusters that remain from merged and unreasonable clusters
|
||||
clusters_out = lu.clean_up_clusters(
|
||||
clusters_out,
|
||||
raw_cells,
|
||||
merge_cells=True,
|
||||
img_table=True,
|
||||
one_cell_table=True,
|
||||
)
|
||||
|
||||
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
|
||||
clusters_out = new_clusters
|
||||
|
||||
## We first rebuild where every cell is now:
|
||||
## Now we write into a prediction cells list, not into the raw cells list.
|
||||
## As we don't need the previous labels, we simply overwrite any old list, because it might
|
||||
## have been sorted differently.
|
||||
(
|
||||
clusters_around_cells,
|
||||
orphan_cell_indices,
|
||||
ambiguous_cell_indices,
|
||||
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||
|
||||
target_cells = []
|
||||
for ix, cell in enumerate(raw_cells):
|
||||
new_cell = {
|
||||
"id": ix,
|
||||
"rawcell_id": ix,
|
||||
"label": "None",
|
||||
"bbox": cell["bbox"],
|
||||
"text": cell["text"],
|
||||
}
|
||||
for cluster_index in clusters_around_cells[
|
||||
ix
|
||||
]: # By previous analysis, this is always 1 cluster.
|
||||
new_cell["label"] = clusters_out[cluster_index]["type"]
|
||||
target_cells.append(new_cell)
|
||||
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
|
||||
cells_out = target_cells
|
||||
|
||||
## -------------------------------
|
||||
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
|
||||
_log.debug("---- 5. Sort clusters in reading order ------")
|
||||
sorted_clusters = lu.produce_reading_order(
|
||||
clusters_out, "raw_cell_ids", "raw_cell_ids", True
|
||||
)
|
||||
clusters_out = sorted_clusters
|
||||
|
||||
# end_time = timer()
|
||||
_log.debug("---- End of postprocessing function ------")
|
||||
end_time = time.time() - start_time
|
||||
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
||||
|
||||
cells_out = [
|
||||
Cell(
|
||||
id=c["id"],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height),
|
||||
text=c["text"],
|
||||
)
|
||||
for c in cells_out
|
||||
]
|
||||
clusters_out_new = []
|
||||
for c in clusters_out:
|
||||
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
|
||||
c_new = Cluster(
|
||||
id=c["id"],
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height),
|
||||
confidence=c["confidence"],
|
||||
label=c["type"],
|
||||
cells=cluster_cells,
|
||||
)
|
||||
clusters_out_new.append(c_new)
|
||||
|
||||
return clusters_out_new, cells_out
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
clusters = []
|
||||
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
|
||||
cluster = Cluster(
|
||||
id=ix,
|
||||
label=pred_item["label"],
|
||||
confidence=pred_item["confidence"],
|
||||
bbox=BoundingBox.model_validate(pred_item),
|
||||
cells=[],
|
||||
)
|
||||
clusters.append(cluster)
|
||||
|
||||
# Map cells to clusters
|
||||
# TODO: Remove, postprocess should take care of it anyway.
|
||||
for cell in page.cells:
|
||||
for cluster in clusters:
|
||||
if not cell.bbox.area() > 0:
|
||||
overlap_frac = 0.0
|
||||
else:
|
||||
overlap_frac = (
|
||||
cell.bbox.intersection_area_with(cluster.bbox)
|
||||
/ cell.bbox.area()
|
||||
)
|
||||
|
||||
if overlap_frac > 0.5:
|
||||
cluster.cells.append(cell)
|
||||
|
||||
# Pre-sort clusters
|
||||
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_clusters_and_cells():
|
||||
image = copy.deepcopy(page.image)
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in clusters:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
for tc in c.cells: # [:1]:
|
||||
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
clusters, page.cells = self.postprocess(
|
||||
clusters, page.cells, page.size.height
|
||||
)
|
||||
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||
|
||||
yield page
|
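The cell-to-cluster mapping in __call__ attaches a cell to a cluster when more than half of the cell's area lies inside the cluster's box. A distilled, dependency-free version of that rule (bboxes as (l, t, r, b) tuples, top-left origin):

def overlap_fraction(cell_bbox, cluster_bbox):
    # Fraction of the cell area covered by the cluster bbox.
    l = max(cell_bbox[0], cluster_bbox[0])
    t = max(cell_bbox[1], cluster_bbox[1])
    r = min(cell_bbox[2], cluster_bbox[2])
    b = min(cell_bbox[3], cluster_bbox[3])
    inter = max(0.0, r - l) * max(0.0, b - t)
    cell_area = (cell_bbox[2] - cell_bbox[0]) * (cell_bbox[3] - cell_bbox[1])
    return inter / cell_area if cell_area > 0 else 0.0

# Attached only when more than half the cell lies inside the cluster (threshold 0.5 above).
assert overlap_fraction((0, 0, 10, 10), (0, 0, 6, 10)) == 0.6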
160
docling/models/page_assemble_model.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import logging
|
||||
import re
|
||||
from typing import Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
FigureElement,
|
||||
Page,
|
||||
PageElement,
|
||||
TableElement,
|
||||
TextElement,
|
||||
)
|
||||
from docling.models.layout_model import LayoutModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PageAssembleModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
|
||||
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
|
||||
|
||||
# def sanitize_text_poor(self, lines):
|
||||
# text = '\n'.join(lines)
|
||||
#
|
||||
# # treat line wraps.
|
||||
# sanitized_text = self.line_wrap_pattern.sub('', text)
|
||||
#
|
||||
# sanitized_text = sanitized_text.replace('\n', ' ')
|
||||
#
|
||||
# return sanitized_text
|
||||
|
||||
def sanitize_text(self, lines):
|
||||
if len(lines) <= 1:
|
||||
return " ".join(lines)
|
||||
|
||||
for ix, line in enumerate(lines[1:]):
|
||||
prev_line = lines[ix]
|
||||
|
||||
if prev_line.endswith("-"):
|
||||
prev_words = re.findall(r"\b[\w]+\b", prev_line)
|
||||
line_words = re.findall(r"\b[\w]+\b", line)
|
||||
|
||||
if (
|
||||
len(prev_words)
|
||||
and len(line_words)
|
||||
and prev_words[-1].isalnum()
|
||||
and line_words[0].isalnum()
|
||||
):
|
||||
lines[ix] = prev_line[:-1]
|
||||
else:
|
||||
lines[ix] += " "
|
||||
|
||||
sanitized_text = "".join(lines)
|
||||
|
||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
for page in page_batch:
|
||||
# assembles some JSON output page by page.
|
||||
|
||||
elements: List[PageElement] = []
|
||||
headers: List[PageElement] = []
|
||||
body: List[PageElement] = []
|
||||
|
||||
for cluster in page.predictions.layout.clusters:
|
||||
# _log.info("Cluster label seen:", cluster.label)
|
||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||
|
||||
textlines = [
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
text = self.sanitize_text(textlines)
|
||||
text_el = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text=text,
|
||||
page_no=page.page_no,
|
||||
cluster=cluster,
|
||||
)
|
||||
elements.append(text_el)
|
||||
|
||||
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||
headers.append(text_el)
|
||||
else:
|
||||
body.append(text_el)
|
||||
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||
tbl = None
|
||||
if page.predictions.tablestructure:
|
||||
tbl = page.predictions.tablestructure.table_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not tbl
|
||||
): # fallback: add table without structure, if it isn't present
|
||||
tbl = TableElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
otsl_seq=[],
|
||||
table_cells=[],
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
|
||||
elements.append(tbl)
|
||||
body.append(tbl)
|
||||
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||
fig = None
|
||||
if page.predictions.figures_classification:
|
||||
fig = page.predictions.figures_classification.figure_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not fig
|
||||
): # fallback: add figure without classification, if it isn't present
|
||||
fig = FigureElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
text="",
|
||||
data=None,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
)
|
||||
elements.append(fig)
|
||||
body.append(fig)
|
||||
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||
equation = None
|
||||
if page.predictions.equations_prediction:
|
||||
equation = (
|
||||
page.predictions.equations_prediction.equation_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
)
|
||||
if not equation: # fallback: add empty formula, if it isn't present
|
||||
text = self.sanitize_text(
|
||||
[
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
if len(cell.text.strip()) > 0
|
||||
]
|
||||
)
|
||||
equation = TextElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
cluster=cluster,
|
||||
page_no=page.page_no,
|
||||
text=text,
|
||||
)
|
||||
elements.append(equation)
|
||||
body.append(equation)
|
||||
|
||||
page.assembled = AssembledUnit(
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
|
||||
yield page
|
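The line-wrap handling in sanitize_text above re-joins hyphenated words and otherwise separates wrapped lines with a space; a small check (the config dict is unused by this method):

from docling.models.page_assemble_model import PageAssembleModel

assembler = PageAssembleModel(config={})
assert assembler.sanitize_text(["experi-", "ment continues"]) == "experiment continues"
assert assembler.sanitize_text(["hello", "world"]) == "hello world"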
114
docling/models/table_structure_model.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Page,
|
||||
TableCell,
|
||||
TableElement,
|
||||
TableStructurePrediction,
|
||||
)
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
|
||||
self.enabled = config["enabled"]
|
||||
if self.enabled:
|
||||
artifacts_path = config["artifacts_path"]
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
||||
self.tm_config["model"]["save_dir"] = artifacts_path
|
||||
self.tm_model_type = self.tm_config["model"]["type"]
|
||||
|
||||
self.tf_predictor = TFPredictor(self.tm_config)
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||
|
||||
in_tables = [
|
||||
(
|
||||
cluster,
|
||||
[
|
||||
round(cluster.bbox.l),
|
||||
round(cluster.bbox.t),
|
||||
round(cluster.bbox.r),
|
||||
round(cluster.bbox.b),
|
||||
],
|
||||
)
|
||||
for cluster in page.predictions.layout.clusters
|
||||
if cluster.label == "Table"
|
||||
]
|
||||
if not len(in_tables):
|
||||
yield page
|
||||
continue
|
||||
|
||||
tokens = []
|
||||
for c in page.cells:
|
||||
for cluster, _ in in_tables:
|
||||
if c.bbox.area() > 0:
|
||||
if (
|
||||
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
|
||||
> 0.2
|
||||
):
|
||||
# Only allow non-empty strings (no whitespace-only text) into the cells of a table
|
||||
if len(c.text.strip()) > 0:
|
||||
tokens.append(c.model_dump())
|
||||
|
||||
iocr_page = {
|
||||
"image": numpy.asarray(page.image),
|
||||
"tokens": tokens,
|
||||
"width": page.size.width,
|
||||
"height": page.size.height,
|
||||
}
|
||||
|
||||
table_clusters, table_bboxes = zip(*in_tables)
|
||||
|
||||
if len(table_bboxes):
|
||||
tf_output = self.tf_predictor.multi_table_predict(
|
||||
iocr_page, table_bboxes, do_matching=self.do_cell_matching
|
||||
)
|
||||
|
||||
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||
table_cells = []
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||
element["bbox"]["token"] = text_piece
|
||||
|
||||
tc = TableCell.model_validate(element)
|
||||
table_cells.append(tc)
|
||||
|
||||
# Retrieving cols/rows, after post processing:
|
||||
num_rows = table_out["predict_details"]["num_rows"]
|
||||
num_cols = table_out["predict_details"]["num_cols"]
|
||||
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||
|
||||
tbl = TableElement(
|
||||
otsl_seq=otsl_seq,
|
||||
table_cells=table_cells,
|
||||
num_rows=num_rows,
|
||||
num_cols=num_cols,
|
||||
id=table_cluster.id,
|
||||
page_no=page.page_no,
|
||||
cluster=table_cluster,
|
||||
label="Table",
|
||||
)
|
||||
|
||||
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
|
||||
|
||||
yield page
|
0
docling/pipeline/__init__.py
Normal file
18
docling/pipeline/base_model_pipeline.py
Normal file
@@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions


class BaseModelPipeline:
    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        self.model_pipe = []
        self.artifacts_path = artifacts_path
        self.pipeline_options = pipeline_options

    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch
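apply() above stacks each model's generator on top of the previous one, so pages stream through the whole pipeline lazily, one batch at a time. The same pattern with plain functions, for illustration only:

def double(pages):
    for p in pages:
        yield p * 2

def add_one(pages):
    for p in pages:
        yield p + 1

model_pipe = [double, add_one]

def apply(page_batch):
    for model in model_pipe:
        page_batch = model(page_batch)
    yield from page_batch

assert list(apply([1, 2, 3])) == [3, 5, 7]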
40
docling/pipeline/standard_model_pipeline.py
Normal file
@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline


class StandardModelPipeline(BaseModelPipeline):
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

        self.model_pipe = [
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": False,
                }
            ),
        ]
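A construction sketch for the standard pipeline above; it assumes the model artifacts are already on disk (e.g. fetched via DocumentConverter.download_models_hf()) and that PipelineOptions exposes the do_ocr / do_table_structure flags used in the config dicts:

from pathlib import Path

from docling.datamodel.base_models import PipelineOptions
from docling.pipeline.standard_model_pipeline import StandardModelPipeline

options = PipelineOptions(do_ocr=False, do_table_structure=True)
pipeline = StandardModelPipeline(Path("./model_artifacts"), pipeline_options=options)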
0
docling/utils/__init__.py
Normal file
806
docling/utils/layout_utils.py
Normal file
@@ -0,0 +1,806 @@
|
||||
import copy
|
||||
import logging
|
||||
|
||||
import networkx as nx
|
||||
|
||||
logger = logging.getLogger("layout_utils")
|
||||
|
||||
|
||||
## -------------------------------
|
||||
## Geometric helper functions
|
||||
## The coordinates grow left to right, and bottom to top.
|
||||
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
|
||||
|
||||
|
||||
def area(bbox):
|
||||
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
||||
|
||||
|
||||
def contains(bbox_i, bbox_j):
|
||||
## Returns True if bbox_i contains bbox_j, else False
|
||||
return (
|
||||
bbox_i[0] <= bbox_j[0]
|
||||
and bbox_i[1] <= bbox_j[1]
|
||||
and bbox_i[2] >= bbox_j[2]
|
||||
and bbox_i[3] >= bbox_j[3]
|
||||
)
|
||||
|
||||
|
||||
def is_intersecting(bbox_i, bbox_j):
|
||||
return not (
|
||||
bbox_i[2] < bbox_j[0]
|
||||
or bbox_i[0] > bbox_j[2]
|
||||
or bbox_i[3] < bbox_j[1]
|
||||
or bbox_i[1] > bbox_j[3]
|
||||
)
|
||||
|
||||
|
||||
def bb_iou(boxA, boxB):
|
||||
# determine the (x, y)-coordinates of the intersection rectangle
|
||||
xA = max(boxA[0], boxB[0])
|
||||
yA = max(boxA[1], boxB[1])
|
||||
xB = min(boxA[2], boxB[2])
|
||||
yB = min(boxA[3], boxB[3])
|
||||
# compute the area of intersection rectangle
|
||||
interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
|
||||
# compute the area of both the prediction and ground-truth
|
||||
# rectangles
|
||||
boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
|
||||
boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
|
||||
# compute the intersection over union by taking the intersection
|
||||
# area and dividing it by the sum of prediction + ground-truth
|
||||
# areas - the intersection area
|
||||
iou = interArea / float(boxAArea + boxBArea - interArea)
|
||||
# return the intersection over union value
|
||||
return iou
|
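A quick numeric check of bb_iou above; note the +1 pixel convention on widths and heights:

from docling.utils.layout_utils import bb_iou

boxA = [0, 0, 9, 9]    # 10 x 10 under the +1 convention
boxB = [5, 5, 14, 14]  # 10 x 10, shifted by 5 in x and y
# intersection 5 x 5 = 25, union 100 + 100 - 25 = 175
assert abs(bb_iou(boxA, boxB) - 25 / 175) < 1e-9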
||||
|
||||
|
||||
def compute_intersection(bbox_i, bbox_j):
|
||||
## Returns the size of the intersection area of the two boxes
|
||||
if not is_intersecting(bbox_i, bbox_j):
|
||||
return 0
|
||||
## Determine the (x, y)-coordinates of the intersection rectangle:
|
||||
xA = max(bbox_i[0], bbox_j[0])
|
||||
yA = max(bbox_i[1], bbox_j[1])
|
||||
xB = min(bbox_i[2], bbox_j[2])
|
||||
yB = min(bbox_i[3], bbox_j[3])
|
||||
## Compute the area of intersection rectangle:
|
||||
interArea = (xB - xA) * (yB - yA)
|
||||
if interArea < 0:
|
||||
logger.debug("Warning: Negative intersection detected!")
|
||||
return 0
|
||||
return interArea
|
||||
|
||||
|
||||
def surrounding(bbox_i, bbox_j):
|
||||
## Computes minimal box that contains both input boxes
|
||||
sbox = []
|
||||
sbox.append(min(bbox_i[0], bbox_j[0]))
|
||||
sbox.append(min(bbox_i[1], bbox_j[1]))
|
||||
sbox.append(max(bbox_i[2], bbox_j[2]))
|
||||
sbox.append(max(bbox_i[3], bbox_j[3]))
|
||||
return sbox
|
||||
|
||||
|
||||
def surrounding_list(bbox_list):
|
||||
## Computes minimal box that contains all boxes in the input list
|
||||
## The list should be non-empty, but just in case it's not:
|
||||
if len(bbox_list) == 0:
|
||||
sbox = [0, 0, 0, 0]
|
||||
else:
|
||||
sbox = []
|
||||
sbox.append(min([bbox[0] for bbox in bbox_list]))
|
||||
sbox.append(min([bbox[1] for bbox in bbox_list]))
|
||||
sbox.append(max([bbox[2] for bbox in bbox_list]))
|
||||
sbox.append(max([bbox[3] for bbox in bbox_list]))
|
||||
return sbox
|
||||
|
||||
|
||||
def vertical_overlap(bboxA, bboxB):
|
||||
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||
if bboxB[3] < bboxA[1]: ## B below A
|
||||
return False
|
||||
elif bboxA[3] < bboxB[1]: ## A below B
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def vertical_overlap_fraction(bboxA, bboxB):
|
||||
## Returns the vertical overlap as fraction of the lower bbox height.
|
||||
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||
## Height 0 is permitted in the input.
|
||||
heightA = bboxA[3] - bboxA[1]
|
||||
heightB = bboxB[3] - bboxB[1]
|
||||
min_height = min(heightA, heightB)
|
||||
if bboxA[3] >= bboxB[3]: ## A starts higher or equal
|
||||
if (
|
||||
bboxA[1] <= bboxB[1]
|
||||
): ## B is completely in A; this can include height of B = 0:
|
||||
fraction = 1
|
||||
else:
|
||||
overlap = max(bboxB[3] - bboxA[1], 0)
|
||||
fraction = overlap / max(min_height, 0.001)
|
||||
else:
|
||||
if (
|
||||
bboxB[1] <= bboxA[1]
|
||||
): ## A is completely in B; this can include height of A = 0:
|
||||
fraction = 1
|
||||
else:
|
||||
overlap = max(bboxA[3] - bboxB[1], 0)
|
||||
fraction = overlap / max(min_height, 0.001)
|
||||
return fraction
|
||||
|
||||
|
||||
## -------------------------------
|
||||
## Cluster-and-cell relations
|
||||
|
||||
|
||||
def compute_enclosed_cells(
|
||||
cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
|
||||
):
|
||||
cells_in_cluster = []
|
||||
cells_in_cluster_int = []
|
||||
for ix, cell in enumerate(raw_cells):
|
||||
cell_bbox = cell["bbox"]
|
||||
intersection = compute_intersection(cell_bbox, cluster_bbox)
|
||||
frac_area = area(cell_bbox) * min_cell_intersection_with_cluster
|
||||
|
||||
if (
|
||||
intersection > frac_area and frac_area > 0
|
||||
): # intersect > certain fraction of cell
|
||||
cells_in_cluster.append(ix)
|
||||
cells_in_cluster_int.append(intersection)
|
||||
elif contains(
|
||||
cluster_bbox,
|
||||
[cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
|
||||
):
|
||||
cells_in_cluster.append(ix)
|
||||
return cells_in_cluster, cells_in_cluster_int
|
||||
|
||||
|
||||
def find_clusters_around_cells(cell_count, clusters):
|
||||
## Per raw cell, find to which clusters it belongs.
|
||||
## Return list of these indices in the raw-cell order.
|
||||
clusters_around_cells = [[] for _ in range(cell_count)]
|
||||
for cl_ix, cluster in enumerate(clusters):
|
||||
for ix in cluster["cell_ids"]:
|
||||
clusters_around_cells[ix].append(cl_ix)
|
||||
return clusters_around_cells
|
||||
|
||||
|
||||
def find_cell_index(raw_ix, cell_array):
|
||||
## "raw_ix" is a rawcell_id.
|
||||
## "cell_array" has the structure of an (annotation) cells array.
|
||||
## Returns index of cell in cell_array that has this rawcell_id.
|
||||
for ix, cell in enumerate(cell_array):
|
||||
if cell["rawcell_id"] == raw_ix:
|
||||
return ix
|
||||
|
||||
|
||||
def find_cell_indices(cluster, cell_array):
|
||||
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||
## "cell_array" that of a cells array.
|
||||
## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
|
||||
## in the order of the rawcell_ids.
|
||||
result = []
|
||||
for raw_ix in sorted(cluster["cell_ids"]):
|
||||
## Find the cell with this rawcell_id (if any)
|
||||
for ix, cell in enumerate(cell_array):
|
||||
if cell["rawcell_id"] == raw_ix:
|
||||
result.append(ix)
|
||||
return result
|
||||
|
||||
|
||||
def find_first_cell_index(cluster, cell_array):
|
||||
## "cluster" must be a dict with key "cell_ids"; it can also be a line.
|
||||
## "cell_array" has the structure of a cells array in an annotation.
|
||||
## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
|
||||
result = [] ## We keep it a list as it can be empty (picture without text cells)
|
||||
if len(cluster["cell_ids"]) == 0:
|
||||
return result
|
||||
raw_ix = min(cluster["cell_ids"])
|
||||
## Find the cell with this rawcell_id (if any)
|
||||
for ix, cell in enumerate(cell_array):
|
||||
if cell["rawcell_id"] == raw_ix:
|
||||
result.append(ix)
|
||||
break ## One is enough; should be only one anyway.
|
||||
if result == []:
|
||||
logger.debug(
|
||||
" Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
## -------------------------------
|
||||
## Cluster labels and text
|
||||
|
||||
|
||||
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
|
||||
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||
## "cl_ix" is its index in target_pred,
|
||||
## "new_label" is the intended new label,
|
||||
## "target_pred" is the entire current target prediction.
|
||||
## Sets label on the cluster itself, and on the cells in the target_pred.
|
||||
## Returns new_label so that also the cl_label variable in the main code is easily set.
|
||||
target_pred["clusters"][cl_ix]["type"] = new_label
|
||||
cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
|
||||
for ix in cluster_target_cells:
|
||||
target_pred["cells"][ix]["label"] = new_label
|
||||
return new_label
|
||||
|
||||
|
||||
def find_cluster_text(cluster, raw_cells):
|
||||
## "cluster" must be a dict with "cell_ids"; it can also be a line.
|
||||
## "raw_cells" must have the format of item["raw"]["cells"]
|
||||
## Returns the text of the cluster, with blanks between the cell contents
|
||||
## (which seem to be words or phrases without starting or trailing blanks).
|
||||
## Note that in formulas, this may give a lot more blanks than in the original.
|
||||
cluster_text = ""
|
||||
for raw_ix in sorted(cluster["cell_ids"]):
|
||||
cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
|
||||
return cluster_text.rstrip()
|
||||
|
||||
|
||||
def find_cluster_text_without_blanks(cluster, raw_cells):
|
||||
## "cluster" must be a dict with "cell_ids"; it can also be a line.
|
||||
## "raw_cells" must have the format of item["raw"]["cells"]
|
||||
## Returns the text of the cluster, without blanks between the cell contents
|
||||
## Interesting in formula analysis.
|
||||
cluster_text = ""
|
||||
for raw_ix in sorted(cluster["cell_ids"]):
|
||||
cluster_text = cluster_text + raw_cells[raw_ix]["text"]
|
||||
return cluster_text.rstrip()
|
||||
|
||||
|
||||
## -------------------------------
|
||||
## Clusters and lines
|
||||
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
|
||||
## but this one also in FormulaAnalysis)
|
||||
|
||||
|
||||
def build_cluster_from_lines(lines, label, id):
|
||||
## Lines must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
|
||||
## (There is no condition that they are really geometrically lines)
|
||||
## A cluster in standard format is returned with given label and id
|
||||
local_lines = copy.deepcopy(
|
||||
lines
|
||||
) ## without this, it changes "lines" also outside this function
|
||||
first_line = local_lines.pop(0)
|
||||
cluster = {
|
||||
"id": id,
|
||||
"type": label,
|
||||
"cell_ids": first_line["cell_ids"],
|
||||
"bbox": first_line["bbox"],
|
||||
"confidence": 0,
|
||||
"created_by": "merged_cells",
|
||||
}
|
||||
confidence = 0
|
||||
counter = 0
|
||||
for line in local_lines:
|
||||
new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
|
||||
cluster["cell_ids"] = new_cell_ids
|
||||
cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
|
||||
counter += 1
|
||||
confidence += line["confidence"]
|
||||
confidence = confidence / counter
|
||||
cluster["confidence"] = confidence
|
||||
return cluster
|
||||
|
||||
|
||||
## -------------------------------
|
||||
## Reading order
|
||||
|
||||
|
||||
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
|
||||
## In:
|
||||
## Clusters: list as in predictions.
|
||||
## cluster_sort_type: string, currently only "raw_cell_ids".
|
||||
## cell_sort_type: string, currently only "raw_cell_ids".
|
||||
## sort_ids: Boolean, whether the cluster ids should be adapted to their new position
|
||||
## Out: Another clusters list, sorted according to the type.
|
||||
|
||||
logger.debug("---- Start cluster sorting ------")
|
||||
|
||||
if cell_sort_type == "raw_cell_ids":
|
||||
for cl in clusters:
|
||||
sorted_cell_ids = sorted(cl["cell_ids"])
|
||||
cl["cell_ids"] = sorted_cell_ids
|
||||
else:
|
||||
logger.debug(
|
||||
"Unknown cell_sort_type `"
|
||||
+ cell_sort_type
|
||||
+ "`, no cell sorting will happen."
|
||||
)
|
||||
|
||||
if cluster_sort_type == "raw_cell_ids":
|
||||
clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
|
||||
clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
|
||||
logger.debug(
|
||||
"Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
|
||||
)
|
||||
logger.debug(
|
||||
" Their first cell ids: "
|
||||
+ str([cl["cell_ids"][0] for cl in clusters_with_cells])
|
||||
)
|
||||
logger.debug(
|
||||
"Clusters without cells: "
|
||||
+ str([cl["id"] for cl in clusters_without_cells])
|
||||
)
|
||||
clusters_with_cells_sorted = sorted(
|
||||
clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
|
||||
)
|
||||
logger.debug(
|
||||
" First cell ids after sorting: "
|
||||
+ str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
|
||||
)
|
||||
sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
|
||||
else:
|
||||
logger.debug(
|
||||
"Unknown cluster_sort_type: `"
|
||||
+ cluster_sort_type
|
||||
+ "`, no cluster sorting will happen."
|
||||
)
|
||||
|
||||
if sort_ids:
|
||||
for i, cl in enumerate(sorted_clusters):
|
||||
cl["id"] = i
|
||||
return sorted_clusters
|
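A worked example of produce_reading_order with toy clusters; only id and cell_ids matter for the raw_cell_ids mode, and sort_ids=True renumbers ids to the new order:

from docling.utils.layout_utils import produce_reading_order

clusters = [
    {"id": 0, "cell_ids": [5, 3]},
    {"id": 1, "cell_ids": [1]},
    {"id": 2, "cell_ids": []},  # e.g. a picture without text cells
]
ordered = produce_reading_order(clusters, "raw_cell_ids", "raw_cell_ids", True)
# Ordered by lowest raw cell id, empty clusters last, ids renumbered.
assert [c["cell_ids"] for c in ordered] == [[1], [3, 5], []]
assert [c["id"] for c in ordered] == [0, 1, 2]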
||||
|
||||
|
||||
## -------------------------------
|
||||
## Line Splitting
|
||||
|
||||
|
||||
def sort_cells_horizontal(line_cell_ids, raw_cells):
|
||||
## "line_cells" should be a non-empty list of (raw) cell_ids
|
||||
## "raw_cells" has the structure of item["raw"]["cells"].
|
||||
## Sorts the cells in the line by x0 (left start).
|
||||
new_line_cell_ids = sorted(
|
||||
line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
|
||||
)
|
||||
return new_line_cell_ids
|
||||
|
||||
|
||||
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
|
||||
new_clusters = []
|
||||
for ix, cluster in enumerate(clusters):
|
||||
new_cluster = copy.deepcopy(cluster)
|
||||
logger.debug(
|
||||
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
|
||||
)
|
||||
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
|
||||
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
|
||||
logger.debug(" Empty non-picture, removed")
|
||||
continue ## Skip this former cluster, now without cells.
|
||||
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
|
||||
new_cluster["bbox"] = new_bbox
|
||||
new_clusters.append(new_cluster)
|
||||
return new_clusters
|
||||
|
||||
|
||||
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
|
||||
if not (cluster["type"] in ["Table", "Picture"]):
|
||||
## A text-like cluster. The bbox only needs to be around the text cells:
|
||||
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
||||
new_bbox = surrounding_list(
|
||||
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
||||
)
|
||||
logger.debug(" New bounding box:" + str(new_bbox))
|
||||
if cluster["type"] == "Picture":
|
||||
## We only make the bbox completely comprise included text cells:
|
||||
logger.debug(" Picture")
|
||||
if len(cluster["cell_ids"]) != 0:
|
||||
min_bbox = surrounding_list(
|
||||
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
||||
)
|
||||
logger.debug(" Minimum bbox: " + str(min_bbox))
|
||||
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
||||
new_bbox = surrounding(min_bbox, cluster["bbox"])
|
||||
logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
|
||||
else:
|
||||
logger.debug(" without text cells, no change.")
|
||||
new_bbox = cluster["bbox"]
|
||||
else: ## A table
|
||||
## At least we have to keep the included text cells, and we make the bbox completely comprise them
|
||||
min_bbox = surrounding_list(
|
||||
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
|
||||
)
|
||||
logger.debug(" Minimum bbox: " + str(min_bbox))
|
||||
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
|
||||
new_bbox = surrounding(min_bbox, cluster["bbox"])
|
||||
logger.debug(" Possibly increased bbox: " + str(new_bbox))
|
||||
|
||||
## Now we look which non-belonging cells are covered.
|
||||
## (To decrease dependencies, we don't make use of which cells we actually removed.)
|
||||
## We don't worry about orphan cells, those could still be added to the table.
|
||||
enclosed_cells = compute_enclosed_cells(
|
||||
new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
|
||||
)[0]
|
||||
additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
|
||||
logger.debug(
|
||||
" Additional cells enclosed by Table bbox: " + str(additional_cells)
|
||||
)
|
||||
spurious_cells = additional_cells - set(orphan_cell_indices)
|
||||
logger.debug(
|
||||
" Spurious cells enclosed by Table bbox (additional minus orphans): "
|
||||
+ str(spurious_cells)
|
||||
)
|
||||
if len(spurious_cells) == 0:
|
||||
return new_bbox
|
||||
|
||||
## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
|
||||
## We initialize possible cuts with the current bbox.
|
||||
left_cut = new_bbox[0]
|
||||
right_cut = new_bbox[2]
|
||||
upper_cut = new_bbox[3]
|
||||
lower_cut = new_bbox[1]
|
||||
|
||||
for cell_ix in spurious_cells:
|
||||
cell = raw_cells[cell_ix]
|
||||
# logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
|
||||
is_left = cell["bbox"][2] < min_bbox[0]
|
||||
is_right = cell["bbox"][0] > min_bbox[2]
|
||||
is_above = cell["bbox"][1] > min_bbox[3]
|
||||
is_below = cell["bbox"][3] < min_bbox[1]
|
||||
# logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))
|
||||
|
||||
if is_left:
|
||||
if cell["bbox"][2] > left_cut:
|
||||
## We move the left cut to exclude this cell:
|
||||
left_cut = cell["bbox"][2]
|
||||
if is_right:
|
||||
if cell["bbox"][0] < right_cut:
|
||||
## We move the right cut to exclude this cell:
|
||||
right_cut = cell["bbox"][0]
|
||||
if is_above:
|
||||
if cell["bbox"][1] < upper_cut:
|
||||
## We move the upper cut to exclude this cell:
|
||||
upper_cut = cell["bbox"][1]
|
||||
if is_below:
|
||||
if cell["bbox"][3] > lower_cut:
|
||||
## We move the lower cut to exclude this cell:
|
||||
lower_cut = cell["bbox"][3]
|
||||
# logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))
|
||||
|
||||
new_bbox = [left_cut, lower_cut, right_cut, upper_cut]
|
||||
|
||||
logger.debug(" Final bbox: " + str(new_bbox))
|
||||
return new_bbox
|
||||
|
||||
|
||||
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
|
||||
DuplicateDeletedClusterIDs = []
|
||||
for cluster_1 in cluster_predictions:
|
||||
for cluster_2 in cluster_predictions:
|
||||
if cluster_1["id"] != cluster_2["id"]:
|
||||
if_conf = False
|
||||
if cluster_1["confidence"] > cluster_2["confidence"]:
|
||||
if_conf = True
|
||||
if if_conf:
|
||||
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
|
||||
DuplicateDeletedClusterIDs.append(cluster_2["id"])
|
||||
elif contains(
|
||||
cluster_1["bbox"],
|
||||
[
|
||||
cluster_2["bbox"][0] + 3,
|
||||
cluster_2["bbox"][1] + 3,
|
||||
cluster_2["bbox"][2] - 3,
|
||||
cluster_2["bbox"][3] - 3,
|
||||
],
|
||||
):
|
||||
DuplicateDeletedClusterIDs.append(cluster_2["id"])
|
||||
|
||||
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
|
||||
|
||||
for cl_id in DuplicateDeletedClusterIDs:
|
||||
for cluster in cluster_predictions:
|
||||
if cl_id == cluster["id"]:
|
||||
cluster_predictions.remove(cluster)
|
||||
return cluster_predictions
|
||||
|
||||
|
||||
# Assign orphan cells by a low confidence prediction that is below the assigned confidence
|
||||
def assign_orphans_with_low_conf_pred(
|
||||
cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
|
||||
):
|
||||
for orph_id in orphan_cell_indices:
|
||||
cluster_chosen = {}
|
||||
iou_thresh = 0.05
|
||||
confidence = 0.05
|
||||
|
||||
# Loop over all predictions, and find the one with the highest IOU, and confidence
|
||||
for cluster in cluster_predictions_low:
|
||||
calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
|
||||
cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
|
||||
cluster["bbox"][2] - cluster["bbox"][0]
|
||||
)
|
||||
cell_area = (
|
||||
raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
|
||||
) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])
|
||||
|
||||
if (
|
||||
(iou_thresh < calc_iou)
|
||||
and (cluster["confidence"] > confidence)
|
||||
and (cell_area * 3 > cluster_area)
|
||||
):
|
||||
cluster_chosen = cluster
|
||||
iou_thresh = calc_iou
|
||||
confidence = cluster["confidence"]
|
||||
# If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
|
||||
if iou_thresh != 0.05 and confidence != 0.05:
|
||||
cluster_chosen["cell_ids"].append(orph_id)
|
||||
cluster_chosen["created_by"] = "orph_low_conf"
|
||||
cluster_predictions.append(cluster_chosen)
|
||||
orphan_cell_indices.remove(orph_id)
|
||||
return cluster_predictions, orphan_cell_indices
|


def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    # Resolve cells that were assigned to more than one cluster: keep the assignment
    # to the highest-confidence, highest-IoU cluster and drop the others.
    for amb_cell_id in amb_cell_idxs:
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        problamatic_clusters = []

        # Find clusters in question
        for cluster in cluster_predictions:

            if amb_cell_id in cluster["cell_ids"]:
                problamatic_clusters.append(amb_cell_id)

            # If the cell_id is in a cluster of high confidence, with the highest IoU score, and smaller in area
            bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])

            if (
                cluster["confidence"] > highest_conf
                and bbox_iou_val > highest_bbox_iou
            ):
                cluster_chosen = cluster
                highest_conf = cluster["confidence"]
                highest_bbox_iou = bbox_iou_val
                if cluster["id"] in problamatic_clusters:
                    problamatic_clusters.remove(cluster["id"])

        # Now remove the cell id assignment from the lower-confidence clusters
        for cluster in cluster_predictions:
            for prob_amb_id in problamatic_clusters:
                if prob_amb_id in cluster["cell_ids"]:
                    cluster["cell_ids"].remove(prob_amb_id)
        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs


def ranges(nums):
    # Find if consecutive numbers exist within pdf cells
    # Used to remove line numbers for review manuscripts
    nums = sorted(set(nums))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))
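For orientation, a small worked example of what ranges() produces (the input values are made up): consecutive values collapse into (start, end) tuples, which set_orphan_as_text below uses to detect long runs of line numbers.

# Hypothetical numeric orphan-cell contents from one page
ranges([1, 2, 3, 7, 8, 10])
# -> [(1, 3), (7, 8), (10, 10)]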


def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
        if cluster["type"] == "Picture":
            figures.append(cluster)

        if cluster["id"] > max_id:
            max_id = cluster["id"]
    max_id += 1

    lines_detector = False
    content_of_orphans = []
    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        content_of_orphans.append(raw_cells[orph_id]["text"])

    fil_content_of_orphans = []
    for cell_content in content_of_orphans:
        if cell_content.isnumeric():
            try:
                num = int(cell_content)
                fil_content_of_orphans.append(num)
            except ValueError:  # ignore the cell
                pass

    # line_orphans = []
    # If there are more than 2 numeric orphan pdf cells, check (via the ranges function)
    # whether they form a consecutive series of numbers, which indicates line numbers.

    if len(fil_content_of_orphans) > 2:
        out_ranges = ranges(fil_content_of_orphans)
        if len(out_ranges) > 1:
            cnt_range = 0
            for ranges_ in out_ranges:
                if ranges_[0] != ranges_[1]:
                    # If a run spans more than 75 numbers (half the total line count of a
                    # review-manuscript page), decide that the page has line numbers to be ignored.
                    if len(list(range(ranges_[0], ranges_[1]))) > 75:
                        lines_detector = True
                        # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
            fig_flag = False
            # Do not assign orphan cells if they are inside a figure
            for fig in figures:
                if contains(fig["bbox"], orph_cell["bbox"]):
                    fig_flag = True

            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
                class_type = "Text"

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
                    class_type = "Text"
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
                    ):
                        class_type = cluster["type"]
                    elif contains(
                        cluster["bbox"],
                        [
                            orph_cell["bbox"][0] + 3,
                            orph_cell["bbox"][1] + 3,
                            orph_cell["bbox"][2] - 3,
                            orph_cell["bbox"][3] - 3,
                        ],
                    ):
                        class_type = cluster["type"]
                    elif intersection > area(orph_cell["bbox"]) * 0.2:
                        class_type = cluster["type"]

                new_cluster = {
                    "id": max_id,
                    "bbox": orph_cell["bbox"],
                    "type": class_type,
                    "cell_ids": [orph_id],
                    "confidence": -1,
                    "created_by": "orphan_default",
                }
                max_id += 1
                cluster_predictions.append(new_cluster)
    return cluster_predictions, orphan_cell_indices
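set_orphan_as_text above and merge_cells below also call area, compute_intersection and is_intersecting, which this hunk does not define. A hedged sketch of the semantics assumed for them, again on [x0, y0, x1, y1] boxes (the *_sketch names are hypothetical, not the repository's API):

def area_sketch(box):
    # Rectangle area of a [x0, y0, x1, y1] box
    return (box[2] - box[0]) * (box[3] - box[1])


def compute_intersection_sketch(box_a, box_b):
    # Area of the overlap between two boxes (0 if they do not overlap)
    w = min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])
    h = min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])
    return max(0, w) * max(0, h)


def is_intersecting_sketch(box_a, box_b):
    # True if the two boxes overlap at all
    return compute_intersection_sketch(box_a, box_b) > 0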


def merge_cells(cluster_predictions):
    # Merge clusters of orphan cells that are touching or very close to each other,
    # using connected components of a proximity graph.
    G = nx.Graph()
    for cluster in cluster_predictions:
        if cluster["created_by"] == "orphan_default":
            G.add_node(cluster["id"])

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if (
                cluster_1["id"] != cluster_2["id"]
                and cluster_2["created_by"] == "orphan_default"
                and cluster_1["created_by"] == "orphan_default"
            ):
                cl1 = copy.deepcopy(cluster_1["bbox"])
                cl2 = copy.deepcopy(cluster_2["bbox"])
                # Grow both boxes by 2 units so that nearly-touching cells also connect
                cl1[0] = cl1[0] - 2
                cl1[1] = cl1[1] - 2
                cl1[2] = cl1[2] + 2
                cl1[3] = cl1[3] + 2
                cl2[0] = cl2[0] - 2
                cl2[1] = cl2[1] - 2
                cl2[2] = cl2[2] + 2
                cl2[3] = cl2[3] + 2
                if is_intersecting(cl1, cl2):
                    G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
    max_id = -1
    for cluster_1 in cluster_predictions:
        if cluster_1["id"] > max_id:
            max_id = cluster_1["id"]

    for nodes in component:
        if len(nodes) > 1:
            max_id += 1
            lines = []
            for node in nodes:
                for cluster in cluster_predictions:
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions


def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    DuplicateDeletedClusterIDs = []

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                # remove any artifacts created by merging clusters
                if merge_cells == True:
                    if contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        cluster_1["cell_ids"] = (
                            cluster_1["cell_ids"] + cluster_2["cell_ids"]
                        )
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                # remove clusters that might appear inside tables or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
                        cluster_1["type"] == "Text"
                        and cluster_2["type"] == "Picture"
                        or cluster_2["type"] == "Table"
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                        elif contains(
                            [
                                cluster_2["bbox"][0] - 3,
                                cluster_2["bbox"][1] - 3,
                                cluster_2["bbox"][2] + 3,
                                cluster_2["bbox"][3] + 3,
                            ],
                            cluster_1["bbox"],
                        ):
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                # remove tables that have only one pdf cell
                if one_cell_table == True:
                    if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
                        DuplicateDeletedClusterIDs.append(cluster_1["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions


def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    for cluster in clusters:
        cells_in_cluster, _ = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        cluster["cell_ids"] = cells_in_cluster
        ## These cell_ids are ids of the raw cells.
        ## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
    return clusters


# Creates a map of cell_id->cluster_id
def cell_id_state_map(clusters, cell_count):
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
    orphan_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
    ]  # which cells are assigned no cluster?
    ambiguous_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
    ]  # which cells are assigned > 1 clusters?
    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
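The helpers in this file are building blocks for docling's layout post-processing; how they are chained is decided by the conversion pipeline elsewhere in the repository. The ordering below is only an illustrative sketch, and the variable names (clusters, low_conf_clusters, raw_cells) and the threshold value are placeholders, not the repository's actual call sequence.

# Illustrative ordering only; not the actual pipeline.
clusters = assigning_cell_ids_to_clusters(clusters, raw_cells, threshold=0.2)
_, orphans, ambiguous = cell_id_state_map(clusters, len(raw_cells))
clusters, ambiguous = remove_ambigous_pdf_cell_by_conf(clusters, raw_cells, ambiguous)
clusters, orphans = set_orphan_as_text(clusters, low_conf_clusters, raw_cells, orphans)
clusters = merge_cells(clusters)
clusters = clean_up_clusters(clusters, raw_cells, merge_cells=True)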
41
docling/utils/utils.py
Normal file
@ -0,0 +1,41 @@
import hashlib
from io import BytesIO
from itertools import islice
from pathlib import Path
from typing import List, Union


def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable."""
    if isinstance(iterator, List):
        iterator = iter(iterator)
    for first in iterator:  # Take the first element from the iterator
        yield [first] + list(islice(iterator, chunk_size - 1))


def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Create a stable page_hash of the path_or_stream of a file"""

    block_size = 65536
    hasher = hashlib.sha256()

    def _hash_buf(binary_stream):
        buf = binary_stream.read(block_size)  # read and page_hash in chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = binary_stream.read(block_size)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as afile:
            _hash_buf(afile)
    elif isinstance(path_or_stream, BytesIO):
        _hash_buf(path_or_stream)

    return hasher.hexdigest()


def create_hash(string: str):
    hasher = hashlib.sha256()
    hasher.update(string.encode("utf-8"))

    return hasher.hexdigest()
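A quick, hedged illustration of the helpers above; the inputs are made up for the example, and only the shape of each result matters.

list(chunkify([1, 2, 3, 4, 5], 2))             # -> [[1, 2], [3, 4], [5]]
create_hash("docling")                          # -> a 64-character hexadecimal SHA-256 digest
create_file_hash(Path("examples/minimal.py"))   # hashes the file contents in 64 KiB blocks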
73
examples/convert.py
Normal file
@ -0,0 +1,73 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    converted_docs: Iterable[ConvertedDocument],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for doc in converted_docs:
        if doc.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = doc.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(doc.render_as_dict()))

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(doc.render_as_markdown())
        else:
            _log.info(f"Document {doc.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        # Path("/Users/cau/Downloads/Issue-36122.pdf"),
        # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"),
        Path("./test/data/2206.01062.pdf"),
        Path("./test/data/2203.01017v2.pdf"),
        Path("./test/data/2305.03393v1.pdf"),
    ]

    artifacts_path = DocumentConverter.download_models_hf()

    doc_converter = DocumentConverter(artifacts_path=artifacts_path)

    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    converted_docs = doc_converter.convert(input)
    export_documents(converted_docs, output_dir=Path("./scratch"))

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
11
examples/minimal.py
Normal file
@ -0,0 +1,11 @@
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)

input = DocumentConversionInput.from_paths(["factsheet.pdf"])
converted_docs = doc_converter.convert(input)

for d in converted_docs:
    print(d.render_as_dict())
4865
poetry.lock
generated
Normal file
File diff suppressed because it is too large
72
pyproject.toml
Normal file
@ -0,0 +1,72 @@
[tool.poetry]
name = "docling"
version = "0.1.0"
description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
readme = "README.md"
keywords = ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]

[tool.poetry.dependencies]
python = "^3.11"
pydantic = "^2.0.0"
docling-core = "^0.2.0"
docling-ibm-models = "^0.2.0"
deepsearch-glm = ">=0.18.4,<1"
deepsearch-toolkit = ">=0.47.0,<1"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"

[tool.poetry.group.ocr.dependencies]
easyocr = "^1.7"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 88
target-version = ["py311"]
include = '\.pyi?$'

[tool.isort]
profile = "black"
line_length = 88
py_version = 311

[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
python_version = "3.11"

[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]
BIN
test/data/2203.01017v2.pdf
Normal file
Binary file not shown.
BIN
test/data/2206.01062.pdf
Normal file
Binary file not shown.
BIN
test/data/2305.03393v1.pdf
Normal file
Binary file not shown.
33
test/test_backend_pdfium.py
Normal file
@ -0,0 +1,33 @@
from pathlib import Path

import pytest

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.datamodel.base_models import BoundingBox


@pytest.fixture
def test_doc_path():
    return Path("./data/2206.01062.pdf")


def test_get_text_from_rect(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102, t=77, r=511, b=124))
    ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"

    assert textpiece.strip() == ref


def test_crop_page_image(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527))
    # im.show()


def test_num_pages(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    assert doc_backend.page_count() == 9