Initial commit
This commit is contained in:
commit
e2d996753b
442
.gitignore
vendored
Normal file
442
.gitignore
vendored
Normal file
@ -0,0 +1,442 @@
|
|||||||
|
model_artifacts/
|
||||||
|
scratch/
|
||||||
|
ds_convert_models/
|
||||||
|
|
||||||
|
# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
|
||||||
|
|
||||||
|
### Emacs ###
|
||||||
|
# -*- mode: gitignore; -*-
|
||||||
|
*~
|
||||||
|
\#*\#
|
||||||
|
/.emacs.desktop
|
||||||
|
/.emacs.desktop.lock
|
||||||
|
*.elc
|
||||||
|
auto-save-list
|
||||||
|
tramp
|
||||||
|
.\#*
|
||||||
|
|
||||||
|
# Org-mode
|
||||||
|
.org-id-locations
|
||||||
|
*_archive
|
||||||
|
|
||||||
|
# flymake-mode
|
||||||
|
*_flymake.*
|
||||||
|
|
||||||
|
# eshell files
|
||||||
|
/eshell/history
|
||||||
|
/eshell/lastdir
|
||||||
|
|
||||||
|
# elpa packages
|
||||||
|
/elpa/
|
||||||
|
|
||||||
|
# reftex files
|
||||||
|
*.rel
|
||||||
|
|
||||||
|
# AUCTeX auto folder
|
||||||
|
/auto/
|
||||||
|
|
||||||
|
# cask packages
|
||||||
|
.cask/
|
||||||
|
dist/
|
||||||
|
|
||||||
|
# Flycheck
|
||||||
|
flycheck_*.el
|
||||||
|
|
||||||
|
# server auth directory
|
||||||
|
/server/
|
||||||
|
|
||||||
|
# projectiles files
|
||||||
|
.projectile
|
||||||
|
|
||||||
|
# directory configuration
|
||||||
|
.dir-locals.el
|
||||||
|
|
||||||
|
# network security
|
||||||
|
/network-security.data
|
||||||
|
|
||||||
|
|
||||||
|
### JupyterNotebooks ###
|
||||||
|
# gitignore template for Jupyter Notebooks
|
||||||
|
# website: http://jupyter.org/
|
||||||
|
|
||||||
|
.ipynb_checkpoints
|
||||||
|
*/.ipynb_checkpoints/*
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# Remove previous ipynb_checkpoints
|
||||||
|
# git rm -r .ipynb_checkpoints/
|
||||||
|
|
||||||
|
### macOS ###
|
||||||
|
# General
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Icon must end with two \r
|
||||||
|
Icon
|
||||||
|
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
### macOS Patch ###
|
||||||
|
# iCloud generated files
|
||||||
|
*.icloud
|
||||||
|
|
||||||
|
### PyCharm ###
|
||||||
|
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||||
|
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||||
|
|
||||||
|
# User-specific stuff
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
||||||
|
|
||||||
|
# AWS User-specific
|
||||||
|
.idea/**/aws.xml
|
||||||
|
|
||||||
|
# Generated files
|
||||||
|
.idea/**/contentModel.xml
|
||||||
|
|
||||||
|
# Sensitive or high-churn files
|
||||||
|
.idea/**/dataSources/
|
||||||
|
.idea/**/dataSources.ids
|
||||||
|
.idea/**/dataSources.local.xml
|
||||||
|
.idea/**/sqlDataSources.xml
|
||||||
|
.idea/**/dynamic.xml
|
||||||
|
.idea/**/uiDesigner.xml
|
||||||
|
.idea/**/dbnavigator.xml
|
||||||
|
|
||||||
|
# Gradle
|
||||||
|
.idea/**/gradle.xml
|
||||||
|
.idea/**/libraries
|
||||||
|
|
||||||
|
# Gradle and Maven with auto-import
|
||||||
|
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||||
|
# since they will be recreated, and may cause churn. Uncomment if using
|
||||||
|
# auto-import.
|
||||||
|
# .idea/artifacts
|
||||||
|
# .idea/compiler.xml
|
||||||
|
# .idea/jarRepositories.xml
|
||||||
|
# .idea/modules.xml
|
||||||
|
# .idea/*.iml
|
||||||
|
# .idea/modules
|
||||||
|
# *.iml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# CMake
|
||||||
|
cmake-build-*/
|
||||||
|
|
||||||
|
# Mongo Explorer plugin
|
||||||
|
.idea/**/mongoSettings.xml
|
||||||
|
|
||||||
|
# File-based project format
|
||||||
|
*.iws
|
||||||
|
|
||||||
|
# IntelliJ
|
||||||
|
out/
|
||||||
|
|
||||||
|
# mpeltonen/sbt-idea plugin
|
||||||
|
.idea_modules/
|
||||||
|
|
||||||
|
# JIRA plugin
|
||||||
|
atlassian-ide-plugin.xml
|
||||||
|
|
||||||
|
# Cursive Clojure plugin
|
||||||
|
.idea/replstate.xml
|
||||||
|
|
||||||
|
# SonarLint plugin
|
||||||
|
.idea/sonarlint/
|
||||||
|
|
||||||
|
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||||
|
com_crashlytics_export_strings.xml
|
||||||
|
crashlytics.properties
|
||||||
|
crashlytics-build.properties
|
||||||
|
fabric.properties
|
||||||
|
|
||||||
|
# Editor-based Rest Client
|
||||||
|
.idea/httpRequests
|
||||||
|
|
||||||
|
# Android studio 3.1+ serialized cache file
|
||||||
|
.idea/caches/build_file_checksums.ser
|
||||||
|
|
||||||
|
### PyCharm Patch ###
|
||||||
|
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
|
||||||
|
|
||||||
|
# *.iml
|
||||||
|
# modules.xml
|
||||||
|
# .idea/misc.xml
|
||||||
|
# *.ipr
|
||||||
|
|
||||||
|
# Sonarlint plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7973-sonarlint
|
||||||
|
.idea/**/sonarlint/
|
||||||
|
|
||||||
|
# SonarQube Plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
|
||||||
|
.idea/**/sonarIssues.xml
|
||||||
|
|
||||||
|
# Markdown Navigator plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
|
||||||
|
.idea/**/markdown-navigator.xml
|
||||||
|
.idea/**/markdown-navigator-enh.xml
|
||||||
|
.idea/**/markdown-navigator/
|
||||||
|
|
||||||
|
# Cache file creation bug
|
||||||
|
# See https://youtrack.jetbrains.com/issue/JBR-2257
|
||||||
|
.idea/$CACHE_FILE$
|
||||||
|
|
||||||
|
# CodeStream plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/12206-codestream
|
||||||
|
.idea/codestream.xml
|
||||||
|
|
||||||
|
# Azure Toolkit for IntelliJ plugin
|
||||||
|
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
|
||||||
|
.idea/**/azureSettings.xml
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
### Python Patch ###
|
||||||
|
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||||
|
poetry.toml
|
||||||
|
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
### Vim ###
|
||||||
|
# Swap
|
||||||
|
[._]*.s[a-v][a-z]
|
||||||
|
!*.svg # comment out if you don't need vector files
|
||||||
|
[._]*.sw[a-p]
|
||||||
|
[._]s[a-rt-v][a-z]
|
||||||
|
[._]ss[a-gi-z]
|
||||||
|
[._]sw[a-p]
|
||||||
|
|
||||||
|
# Session
|
||||||
|
Session.vim
|
||||||
|
Sessionx.vim
|
||||||
|
|
||||||
|
# Temporary
|
||||||
|
.netrwhist
|
||||||
|
# Auto-generated tag files
|
||||||
|
tags
|
||||||
|
# Persistent undo
|
||||||
|
[._]*.un~
|
||||||
|
|
||||||
|
|
||||||
|
### Visual Studio Code ###
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
### VirtualEnv ###
|
||||||
|
# Virtualenv
|
||||||
|
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
||||||
|
[Bb]in
|
||||||
|
[Ii]nclude
|
||||||
|
[Ll]ib
|
||||||
|
[Ll]ib64
|
||||||
|
[Ll]ocal
|
||||||
|
[Ss]cripts
|
||||||
|
pyvenv.cfg
|
||||||
|
pip-selfcheck.json
|
||||||
|
|
||||||
|
### VisualStudioCode ###
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
!.vscode/*.code-snippets
|
||||||
|
|
||||||
|
# Local History for Visual Studio Code
|
||||||
|
.history/
|
||||||
|
|
||||||
|
# Built Visual Studio Code Extensions
|
||||||
|
*.vsix
|
||||||
|
|
||||||
|
### VisualStudioCode Patch ###
|
||||||
|
# Ignore all local history of files
|
||||||
|
.history
|
||||||
|
.ionide
|
||||||
|
|
||||||
|
|
||||||
|
# Docs
|
||||||
|
# docs/**/*.png
|
||||||
|
# docs/**/*.svg
|
34
.pre-commit-config.yaml
Normal file
34
.pre-commit-config.yaml
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
fail_fast: true
|
||||||
|
repos:
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: system
|
||||||
|
name: Black
|
||||||
|
entry: poetry run black docling examples
|
||||||
|
pass_filenames: false
|
||||||
|
language: system
|
||||||
|
files: '\.py$'
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: system
|
||||||
|
name: isort
|
||||||
|
entry: poetry run isort docling examples
|
||||||
|
pass_filenames: false
|
||||||
|
language: system
|
||||||
|
files: '\.py$'
|
||||||
|
# - repo: local
|
||||||
|
# hooks:
|
||||||
|
# - id: system
|
||||||
|
# name: flake8
|
||||||
|
# entry: poetry run flake8 docling
|
||||||
|
# pass_filenames: false
|
||||||
|
# language: system
|
||||||
|
# files: '\.py$'
|
||||||
|
# - repo: local
|
||||||
|
# hooks:
|
||||||
|
# - id: system
|
||||||
|
# name: MyPy
|
||||||
|
# entry: poetry run mypy docling
|
||||||
|
# pass_filenames: false
|
||||||
|
# language: system
|
||||||
|
# files: '\.py$'
|
129
CODE_OF_CONDUCT.md
Normal file
129
CODE_OF_CONDUCT.md
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
# Contributor Covenant Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socio-economic status,
|
||||||
|
nationality, personal appearance, race, religion, or sexual identity
|
||||||
|
and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the
|
||||||
|
overall community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or
|
||||||
|
advances of any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email
|
||||||
|
address, without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official e-mail address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement using
|
||||||
|
[deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
||||||
|
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series
|
||||||
|
of actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or
|
||||||
|
permanent ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within
|
||||||
|
the community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||||
|
version 2.0, available at
|
||||||
|
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||||
|
enforcement ladder](https://github.com/mozilla/diversity).
|
||||||
|
|
||||||
|
Homepage: [https://www.contributor-covenant.org](https://www.contributor-covenant.org)
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the FAQ at
|
||||||
|
[https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[https://www.contributor-covenant.org/translations](https://www.contributor-covenant.org/translations).
|
184
CONTRIBUTING.md
Normal file
184
CONTRIBUTING.md
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
## Contributing In General
|
||||||
|
Our project welcomes external contributions. If you have an itch, please feel
|
||||||
|
free to scratch it.
|
||||||
|
|
||||||
|
To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls).
|
||||||
|
|
||||||
|
A good way to familiarize yourself with the codebase and contribution process is
|
||||||
|
to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues).
|
||||||
|
Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us.
|
||||||
|
|
||||||
|
For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
**Note: We appreciate your effort, and want to avoid a situation where a contribution
|
||||||
|
requires extensive rework (by you or by us), sits in backlog for a long time, or
|
||||||
|
cannot be accepted at all!**
|
||||||
|
|
||||||
|
### Proposing new features
|
||||||
|
|
||||||
|
If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues)
|
||||||
|
before sending a pull request so the feature can be discussed. This is to avoid
|
||||||
|
you wasting your valuable time working on a feature that the project developers
|
||||||
|
are not interested in accepting into the code base.
|
||||||
|
|
||||||
|
### Fixing bugs
|
||||||
|
|
||||||
|
If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a
|
||||||
|
pull request so it can be tracked.
|
||||||
|
|
||||||
|
### Merge approval
|
||||||
|
|
||||||
|
The project maintainers use LGTM (Looks Good To Me) in comments on the code
|
||||||
|
review to indicate acceptance. A change requires LGTMs from two of the
|
||||||
|
maintainers of each component affected.
|
||||||
|
|
||||||
|
For a list of the maintainers, see the [MAINTAINERS.md](MAINTAINERS.md) page.
|
||||||
|
|
||||||
|
|
||||||
|
## Legal
|
||||||
|
|
||||||
|
Each source file must include a license header for the MIT
|
||||||
|
Software. Using the SPDX format is the simplest approach.
|
||||||
|
e.g.
|
||||||
|
|
||||||
|
```
|
||||||
|
/*
|
||||||
|
Copyright IBM Inc. All rights reserved.
|
||||||
|
|
||||||
|
SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
```
|
||||||
|
|
||||||
|
We have tried to make it as easy as possible to make contributions. This
|
||||||
|
applies to how we handle the legal aspects of contribution. We use the
|
||||||
|
same approach - the [Developer's Certificate of Origin 1.1 (DCO)](https://github.com/hyperledger/fabric/blob/master/docs/source/DCO1.1.txt) - that the Linux® Kernel [community](https://elinux.org/Developer_Certificate_Of_Origin)
|
||||||
|
uses to manage code contributions.
|
||||||
|
|
||||||
|
We simply ask that when submitting a patch for review, the developer
|
||||||
|
must include a sign-off statement in the commit message.
|
||||||
|
|
||||||
|
Here is an example Signed-off-by line, which indicates that the
|
||||||
|
submitter accepts the DCO:
|
||||||
|
|
||||||
|
```
|
||||||
|
Signed-off-by: John Doe <john.doe@example.com>
|
||||||
|
```
|
||||||
|
|
||||||
|
You can include this automatically when you commit a change to your
|
||||||
|
local git repository using the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
git commit -s
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Communication
|
||||||
|
|
||||||
|
Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Developing
|
||||||
|
|
||||||
|
### Usage of Poetry
|
||||||
|
|
||||||
|
We use Poetry to manage dependencies.
|
||||||
|
|
||||||
|
|
||||||
|
#### Install
|
||||||
|
|
||||||
|
To install, see the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
|
||||||
|
|
||||||
|
1. Install the Poetry globally in your machine
|
||||||
|
```bash
|
||||||
|
curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
```
|
||||||
|
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
|
||||||
|
|
||||||
|
2. Make sure Poetry is in your `$PATH`
|
||||||
|
- for `zsh`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
|
||||||
|
```
|
||||||
|
- for `bash`
|
||||||
|
```sh
|
||||||
|
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
|
||||||
|
```
|
||||||
|
|
||||||
|
3. The official guidelines linked above include useful details on the configuration of autocomplete for most shell environments, e.g. Bash and Zsh.
|
||||||
|
|
||||||
|
|
||||||
|
#### Create a Virtual Environment and Install Dependencies
|
||||||
|
|
||||||
|
To activate the Virtual Environment, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry shell
|
||||||
|
```
|
||||||
|
|
||||||
|
To spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
**(Advanced) Use a Specific Python Version**
|
||||||
|
|
||||||
|
If for whatever reason you need to work in a specific (older) version of Python, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry env use $(which python3.8)
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` by the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
|
||||||
|
|
||||||
|
|
||||||
|
#### Add a new dependency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry add NAME
|
||||||
|
```
|
||||||
|
|
||||||
|
## Coding style guidelines
|
||||||
|
|
||||||
|
We use the following tools to enforce code style:
|
||||||
|
|
||||||
|
- iSort, to sort imports
|
||||||
|
- Black, to format code
|
||||||
|
|
||||||
|
|
||||||
|
We run a series of checks on the code base on every commit, using `pre-commit`. To install the hooks, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pre-commit install
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the checks on-demand, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
pre-commit run --all-files
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by their Hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
We use [MkDocs](https://www.mkdocs.org/) to write documentation.
|
||||||
|
|
||||||
|
To run the documentation server, do:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs serve
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will be available on [http://localhost:8000](http://localhost:8000).
|
||||||
|
|
||||||
|
### Pushing Documentation to GitHub pages
|
||||||
|
|
||||||
|
Run the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdocs gh-deploy
|
||||||
|
```
|
23
Dockerfile
Normal file
23
Dockerfile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
FROM python:3.11-slim-bookworm
|
||||||
|
|
||||||
|
ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
|
||||||
|
&& apt-get clean
|
||||||
|
|
||||||
|
RUN --mount=type=ssh \
|
||||||
|
pip install --no-cache-dir https://github.com/DS4SD/docling.git
|
||||||
|
|
||||||
|
ENV HF_HOME=/tmp/
|
||||||
|
ENV TORCH_HOME=/tmp/
|
||||||
|
|
||||||
|
COPY examples/minimal.py /root/minimal.py
|
||||||
|
|
||||||
|
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||||
|
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||||
|
RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
|
||||||
|
|
||||||
|
# On container shell:
|
||||||
|
# > cd /root/
|
||||||
|
# > python minimal.py
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) [year] [fullname]
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
10
MAINTAINERS.md
Normal file
10
MAINTAINERS.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# MAINTAINERS
|
||||||
|
|
||||||
|
- Christoph Auer - [@cau-git](https://github.com/cau-git)
|
||||||
|
- Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm)
|
||||||
|
- Maxim Lysak - [@maxmnemonic](https://github.com/maxmnemonic)
|
||||||
|
- Nikos Livathinos - [@nikos-livathinos](https://github.com/nikos-livathinos)
|
||||||
|
- Ahmed Nassar [@nassarofficial](https://github.com/nassarofficial)
|
||||||
|
- Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM)
|
||||||
|
|
||||||
|
Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com).
|
99
README.md
Normal file
99
README.md
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
<p align="center">
|
||||||
|
<a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="logo.png" width="150" /> </a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
# Docling
|
||||||
|
|
||||||
|
Dockling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
||||||
|
* 📑 Understands detailed page layout, reading order and recovers table structures
|
||||||
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
||||||
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
You need Python 3.11 and poetry. Install poetry from [here](https://python-poetry.org/docs/#installing-with-the-official-installer).
|
||||||
|
|
||||||
|
Once you have `poetry` installed, create an environment and install the package:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
poetry env use $(which python3.11)
|
||||||
|
poetry shell
|
||||||
|
poetry install
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notes**:
|
||||||
|
* Works on macOS and Linux environments. Windows platforms are currently not tested.
|
||||||
|
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
For basic usage, see the [convert.py](examples/convert.py) example module. Run with:
|
||||||
|
|
||||||
|
```
|
||||||
|
python examples/convert.py
|
||||||
|
```
|
||||||
|
The output of the above command will be written to `./scratch`.
|
||||||
|
|
||||||
|
### Enable or disable pipeline features
|
||||||
|
|
||||||
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`
|
||||||
|
```python
|
||||||
|
doc_converter = DocumentConverter(
|
||||||
|
artifacts_path=artifacts_path,
|
||||||
|
pipeline_options=PipelineOptions(do_table_structure=False, # Controls if table structure is recovered.
|
||||||
|
do_ocr=True), # Controls if OCR is applied (ignores programmatic content)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Impose limits on the document size
|
||||||
|
|
||||||
|
You can limit the file size and number of pages which should be allowed to process per document.
|
||||||
|
```python
|
||||||
|
paths = [Path("./test/data/2206.01062.pdf")]
|
||||||
|
|
||||||
|
input = DocumentConversionInput.from_paths(
|
||||||
|
paths, limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Convert from binary PDF streams
|
||||||
|
|
||||||
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
||||||
|
```python
|
||||||
|
buf = BytesIO(your_binary_stream)
|
||||||
|
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||||
|
input = DocumentConversionInput.from_streams(docs)
|
||||||
|
converted_docs = doc_converter.convert(input)
|
||||||
|
```
|
||||||
|
### Limit resource usage
|
||||||
|
|
||||||
|
You can limit the CPU threads used by `docling` by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Please read [Contributing to Docling](./CONTRIBUTING.md) for details.
|
||||||
|
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
If you use `Docling` in your projects, please consider citing the following:
|
||||||
|
|
||||||
|
```bib
|
||||||
|
@software{Docling,
|
||||||
|
author = {Deep Search Team},
|
||||||
|
month = {7},
|
||||||
|
title = {{Docling}},
|
||||||
|
url = {https://github.com/DS4SD/docling},
|
||||||
|
version = {main},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
The `Docling` codebase is under MIT license.
|
||||||
|
For individual model usage, please refer to the model licenses found in the original packages.
|
0
docling/__init__.py
Normal file
0
docling/__init__.py
Normal file
0
docling/backend/__init__.py
Normal file
0
docling/backend/__init__.py
Normal file
55
docling/backend/abstract_backend.py
Normal file
55
docling/backend/abstract_backend.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable, Optional, Union
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class PdfPageBackend(ABC):
|
||||||
|
def __init__(self, page_obj: Any) -> object:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_text_cells(self) -> Iterable["Cell"]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_page_image(
|
||||||
|
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
|
||||||
|
) -> Image.Image:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_size(self) -> "PageSize":
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def unload(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PdfDocumentBackend(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def page_count(self) -> int:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def unload(self):
|
||||||
|
pass
|
223
docling/backend/pypdfium2_backend.py
Normal file
223
docling/backend/pypdfium2_backend.py
Normal file
@ -0,0 +1,223 @@
|
|||||||
|
import random
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfiumPageBackend(PdfPageBackend):
|
||||||
|
def __init__(self, page_obj: PdfPage):
|
||||||
|
super().__init__(page_obj)
|
||||||
|
self._ppage = page_obj
|
||||||
|
self.text_page = None
|
||||||
|
|
||||||
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||||
|
if not self.text_page:
|
||||||
|
self.text_page = self._ppage.get_textpage()
|
||||||
|
|
||||||
|
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||||
|
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||||
|
|
||||||
|
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||||
|
|
||||||
|
return text_piece
|
||||||
|
|
||||||
|
def get_text_cells(self) -> Iterable[Cell]:
|
||||||
|
if not self.text_page:
|
||||||
|
self.text_page = self._ppage.get_textpage()
|
||||||
|
|
||||||
|
cells = []
|
||||||
|
cell_counter = 0
|
||||||
|
|
||||||
|
page_size = self.get_size()
|
||||||
|
|
||||||
|
for i in range(self.text_page.count_rects()):
|
||||||
|
rect = self.text_page.get_rect(i)
|
||||||
|
text_piece = self.text_page.get_text_bounded(*rect)
|
||||||
|
x0, y0, x1, y1 = rect
|
||||||
|
cells.append(
|
||||||
|
Cell(
|
||||||
|
id=cell_counter,
|
||||||
|
text=text_piece,
|
||||||
|
bbox=BoundingBox(
|
||||||
|
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_size.height),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
cell_counter += 1
|
||||||
|
|
||||||
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
||||||
|
# The cell merging code below is to clean this up.
|
||||||
|
def merge_horizontal_cells(
|
||||||
|
cells: List[Cell],
|
||||||
|
horizontal_threshold_factor: float = 1.0,
|
||||||
|
vertical_threshold_factor: float = 0.5,
|
||||||
|
) -> List[Cell]:
|
||||||
|
if not cells:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def group_rows(cells: List[Cell]) -> List[List[Cell]]:
|
||||||
|
rows = []
|
||||||
|
current_row = [cells[0]]
|
||||||
|
row_top = cells[0].bbox.t
|
||||||
|
row_bottom = cells[0].bbox.b
|
||||||
|
row_height = cells[0].bbox.height
|
||||||
|
|
||||||
|
for cell in cells[1:]:
|
||||||
|
vertical_threshold = row_height * vertical_threshold_factor
|
||||||
|
if (
|
||||||
|
abs(cell.bbox.t - row_top) <= vertical_threshold
|
||||||
|
and abs(cell.bbox.b - row_bottom) <= vertical_threshold
|
||||||
|
):
|
||||||
|
current_row.append(cell)
|
||||||
|
row_top = min(row_top, cell.bbox.t)
|
||||||
|
row_bottom = max(row_bottom, cell.bbox.b)
|
||||||
|
row_height = row_bottom - row_top
|
||||||
|
else:
|
||||||
|
rows.append(current_row)
|
||||||
|
current_row = [cell]
|
||||||
|
row_top = cell.bbox.t
|
||||||
|
row_bottom = cell.bbox.b
|
||||||
|
row_height = cell.bbox.height
|
||||||
|
|
||||||
|
if current_row:
|
||||||
|
rows.append(current_row)
|
||||||
|
|
||||||
|
return rows
|
||||||
|
|
||||||
|
def merge_row(row: List[Cell]) -> List[Cell]:
|
||||||
|
merged = []
|
||||||
|
current_group = [row[0]]
|
||||||
|
|
||||||
|
for cell in row[1:]:
|
||||||
|
prev_cell = current_group[-1]
|
||||||
|
avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
|
||||||
|
if (
|
||||||
|
cell.bbox.l - prev_cell.bbox.r
|
||||||
|
<= avg_height * horizontal_threshold_factor
|
||||||
|
):
|
||||||
|
current_group.append(cell)
|
||||||
|
else:
|
||||||
|
merged.append(merge_group(current_group))
|
||||||
|
current_group = [cell]
|
||||||
|
|
||||||
|
if current_group:
|
||||||
|
merged.append(merge_group(current_group))
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
def merge_group(group: List[Cell]) -> Cell:
|
||||||
|
if len(group) == 1:
|
||||||
|
return group[0]
|
||||||
|
|
||||||
|
merged_text = "".join(cell.text for cell in group)
|
||||||
|
merged_bbox = BoundingBox(
|
||||||
|
l=min(cell.bbox.l for cell in group),
|
||||||
|
t=min(cell.bbox.t for cell in group),
|
||||||
|
r=max(cell.bbox.r for cell in group),
|
||||||
|
b=max(cell.bbox.b for cell in group),
|
||||||
|
)
|
||||||
|
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
||||||
|
|
||||||
|
rows = group_rows(cells)
|
||||||
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
||||||
|
|
||||||
|
for i, cell in enumerate(merged_cells, 1):
|
||||||
|
cell.id = i
|
||||||
|
|
||||||
|
return merged_cells
|
||||||
|
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = self.get_page_image()
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in cells:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# before merge:
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
cells = merge_horizontal_cells(cells)
|
||||||
|
|
||||||
|
# after merge:
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
return cells
|
||||||
|
|
||||||
|
def get_page_image(
|
||||||
|
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
||||||
|
) -> Image.Image:
|
||||||
|
|
||||||
|
page_size = self.get_size()
|
||||||
|
|
||||||
|
if not cropbox:
|
||||||
|
cropbox = BoundingBox(
|
||||||
|
l=0,
|
||||||
|
r=page_size.width,
|
||||||
|
t=0,
|
||||||
|
b=page_size.height,
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
padbox = BoundingBox(
|
||||||
|
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
||||||
|
padbox.r = page_size.width - padbox.r
|
||||||
|
padbox.t = page_size.height - padbox.t
|
||||||
|
|
||||||
|
image = (
|
||||||
|
self._ppage.render(
|
||||||
|
scale=scale * 1.5,
|
||||||
|
rotation=0, # no additional rotation
|
||||||
|
crop=padbox.as_tuple(),
|
||||||
|
)
|
||||||
|
.to_pil()
|
||||||
|
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
||||||
|
) # We resize the image from 1.5x the given scale to make it sharper.
|
||||||
|
|
||||||
|
return image
|
||||||
|
|
||||||
|
def get_size(self) -> PageSize:
|
||||||
|
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
self._ppage = None
|
||||||
|
self.text_page = None
|
||||||
|
|
||||||
|
|
||||||
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||||
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
||||||
|
super().__init__(path_or_stream)
|
||||||
|
|
||||||
|
if isinstance(path_or_stream, Path):
|
||||||
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||||
|
elif isinstance(path_or_stream, BytesIO):
|
||||||
|
self._pdoc = pdfium.PdfDocument(
|
||||||
|
path_or_stream
|
||||||
|
) # TODO Fix me, won't accept bytes.
|
||||||
|
|
||||||
|
def page_count(self) -> int:
|
||||||
|
return len(self._pdoc)
|
||||||
|
|
||||||
|
def load_page(self, page_no: int) -> PdfPage:
|
||||||
|
return PyPdfiumPageBackend(self._pdoc[page_no])
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return self.page_count() > 0
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
self._pdoc.close()
|
||||||
|
self._pdoc = None
|
0
docling/datamodel/__init__.py
Normal file
0
docling/datamodel/__init__.py
Normal file
247
docling/datamodel/base_models.py
Normal file
247
docling/datamodel/base_models.py
Normal file
@ -0,0 +1,247 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
from pydantic import BaseModel, ConfigDict, model_validator
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfPageBackend
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionStatus(str, Enum):
|
||||||
|
PENDING = auto()
|
||||||
|
STARTED = auto()
|
||||||
|
FAILURE = auto()
|
||||||
|
SUCCESS = auto()
|
||||||
|
SUCCESS_WITH_ERRORS = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class DocInputType(str, Enum):
|
||||||
|
PATH = auto()
|
||||||
|
STREAM = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class CoordOrigin(str, Enum):
|
||||||
|
TOPLEFT = auto()
|
||||||
|
BOTTOMLEFT = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class PageSize(BaseModel):
|
||||||
|
width: float = 0.0
|
||||||
|
height: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class BoundingBox(BaseModel):
|
||||||
|
l: float # left
|
||||||
|
t: float # top
|
||||||
|
r: float # right
|
||||||
|
b: float # bottom
|
||||||
|
|
||||||
|
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
||||||
|
|
||||||
|
@property
|
||||||
|
def width(self):
|
||||||
|
return self.r - self.l
|
||||||
|
|
||||||
|
@property
|
||||||
|
def height(self):
|
||||||
|
return abs(self.t - self.b)
|
||||||
|
|
||||||
|
def as_tuple(self):
|
||||||
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return (self.l, self.t, self.r, self.b)
|
||||||
|
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return (self.l, self.b, self.r, self.t)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
|
||||||
|
if origin == CoordOrigin.TOPLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin
|
||||||
|
)
|
||||||
|
elif origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin
|
||||||
|
)
|
||||||
|
|
||||||
|
def area(self) -> float:
|
||||||
|
return (self.r - self.l) * (self.b - self.t)
|
||||||
|
|
||||||
|
def intersection_area_with(self, other: "BoundingBox") -> float:
|
||||||
|
# Calculate intersection coordinates
|
||||||
|
left = max(self.l, other.l)
|
||||||
|
top = max(self.t, other.t)
|
||||||
|
right = min(self.r, other.r)
|
||||||
|
bottom = min(self.b, other.b)
|
||||||
|
|
||||||
|
# Calculate intersection dimensions
|
||||||
|
width = right - left
|
||||||
|
height = bottom - top
|
||||||
|
|
||||||
|
# If the bounding boxes do not overlap, width or height will be negative
|
||||||
|
if width <= 0 or height <= 0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
return width * height
|
||||||
|
|
||||||
|
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
||||||
|
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return self
|
||||||
|
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=self.l,
|
||||||
|
r=self.r,
|
||||||
|
t=page_height - self.t,
|
||||||
|
b=page_height - self.b,
|
||||||
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_top_left_origin(self, page_height):
|
||||||
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
||||||
|
return self
|
||||||
|
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
||||||
|
return BoundingBox(
|
||||||
|
l=self.l,
|
||||||
|
r=self.r,
|
||||||
|
t=page_height - self.t, # self.b
|
||||||
|
b=page_height - self.b, # self.t
|
||||||
|
coord_origin=CoordOrigin.TOPLEFT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Cell(BaseModel):
|
||||||
|
id: int
|
||||||
|
text: str
|
||||||
|
bbox: BoundingBox
|
||||||
|
|
||||||
|
|
||||||
|
class OcrCell(Cell):
|
||||||
|
confidence: float
|
||||||
|
|
||||||
|
|
||||||
|
class Cluster(BaseModel):
|
||||||
|
id: int
|
||||||
|
label: str
|
||||||
|
bbox: BoundingBox
|
||||||
|
confidence: float = 1.0
|
||||||
|
cells: List[Cell] = []
|
||||||
|
|
||||||
|
|
||||||
|
class BasePageElement(BaseModel):
|
||||||
|
label: str
|
||||||
|
id: int
|
||||||
|
page_no: int
|
||||||
|
cluster: Cluster
|
||||||
|
text: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutPrediction(BaseModel):
|
||||||
|
clusters: List[Cluster] = []
|
||||||
|
|
||||||
|
|
||||||
|
class TableCell(BaseModel):
|
||||||
|
bbox: BoundingBox
|
||||||
|
row_span: int
|
||||||
|
col_span: int
|
||||||
|
start_row_offset_idx: int
|
||||||
|
end_row_offset_idx: int
|
||||||
|
start_col_offset_idx: int
|
||||||
|
end_col_offset_idx: int
|
||||||
|
text: str
|
||||||
|
column_header: bool = False
|
||||||
|
row_header: bool = False
|
||||||
|
row_section: bool = False
|
||||||
|
|
||||||
|
@model_validator(mode="before")
|
||||||
|
@classmethod
|
||||||
|
def from_dict_format(cls, data: Any) -> Any:
|
||||||
|
if isinstance(data, Dict):
|
||||||
|
text = data["bbox"].get("token", "")
|
||||||
|
if not len(text):
|
||||||
|
text_cells = data.pop("text_cell_bboxes", None)
|
||||||
|
if text_cells:
|
||||||
|
for el in text_cells:
|
||||||
|
text += el["token"] + " "
|
||||||
|
|
||||||
|
text = text.strip()
|
||||||
|
data["text"] = text
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
class TableElement(BasePageElement):
|
||||||
|
otsl_seq: List[str]
|
||||||
|
num_rows: int = 0
|
||||||
|
num_cols: int = 0
|
||||||
|
table_cells: List[TableCell]
|
||||||
|
|
||||||
|
|
||||||
|
class TableStructurePrediction(BaseModel):
|
||||||
|
table_map: Dict[int, TableElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class TextElement(BasePageElement):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class FigureData(BaseModel):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class FigureElement(BasePageElement):
|
||||||
|
data: Optional[FigureData] = None
|
||||||
|
provenance: Optional[str] = None
|
||||||
|
predicted_class: Optional[str] = None
|
||||||
|
confidence: Optional[float] = None
|
||||||
|
|
||||||
|
|
||||||
|
class FigureClassificationPrediction(BaseModel):
|
||||||
|
figure_count: int = 0
|
||||||
|
figure_map: Dict[int, FigureElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class EquationPrediction(BaseModel):
|
||||||
|
equation_count: int = 0
|
||||||
|
equation_map: Dict[int, TextElement] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class PagePredictions(BaseModel):
|
||||||
|
layout: LayoutPrediction = None
|
||||||
|
tablestructure: TableStructurePrediction = None
|
||||||
|
figures_classification: FigureClassificationPrediction = None
|
||||||
|
equations_prediction: EquationPrediction = None
|
||||||
|
|
||||||
|
|
||||||
|
PageElement = Union[TextElement, TableElement, FigureElement]
|
||||||
|
|
||||||
|
|
||||||
|
class AssembledUnit(BaseModel):
|
||||||
|
elements: List[PageElement]
|
||||||
|
body: List[PageElement]
|
||||||
|
headers: List[PageElement]
|
||||||
|
|
||||||
|
|
||||||
|
class Page(BaseModel):
|
||||||
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
|
page_no: int
|
||||||
|
page_hash: str = None
|
||||||
|
size: PageSize = None
|
||||||
|
image: Image = None
|
||||||
|
cells: List[Cell] = None
|
||||||
|
predictions: PagePredictions = PagePredictions()
|
||||||
|
assembled: AssembledUnit = None
|
||||||
|
|
||||||
|
_backend: PdfPageBackend = None # Internal PDF backend
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentStream(BaseModel):
|
||||||
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
|
filename: str
|
||||||
|
stream: BytesIO
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineOptions(BaseModel):
|
||||||
|
do_table_structure: bool = True
|
||||||
|
do_ocr: bool = False
|
351
docling/datamodel/document.py
Normal file
351
docling/datamodel/document.py
Normal file
@ -0,0 +1,351 @@
|
|||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path, PurePath
|
||||||
|
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
||||||
|
|
||||||
|
from deepsearch.documents.core.export import export_to_markdown
|
||||||
|
from docling_core.types import BaseCell, BaseText
|
||||||
|
from docling_core.types import BoundingBox as DsBoundingBox
|
||||||
|
from docling_core.types import Document as DsDocument
|
||||||
|
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||||
|
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||||
|
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||||
|
from docling_core.types import Table as DsSchemaTable
|
||||||
|
from docling_core.types import TableCell
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
AssembledUnit,
|
||||||
|
ConversionStatus,
|
||||||
|
DocumentStream,
|
||||||
|
FigureElement,
|
||||||
|
Page,
|
||||||
|
TableElement,
|
||||||
|
TextElement,
|
||||||
|
)
|
||||||
|
from docling.datamodel.settings import DocumentLimits
|
||||||
|
from docling.utils.utils import create_file_hash
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
layout_label_to_ds_type = {
|
||||||
|
"Title": "title",
|
||||||
|
"Document Index": "table-of-path_or_stream",
|
||||||
|
"Section-header": "subtitle-level-1",
|
||||||
|
"Checkbox-Selected": "checkbox-selected",
|
||||||
|
"Checkbox-Unselected": "checkbox-unselected",
|
||||||
|
"Caption": "caption",
|
||||||
|
"Page-header": "page-header",
|
||||||
|
"Page-footer": "page-footer",
|
||||||
|
"Footnote": "footnote",
|
||||||
|
"Table": "table",
|
||||||
|
"Formula": "equation",
|
||||||
|
"List-item": "paragraph",
|
||||||
|
"Code": "paragraph",
|
||||||
|
"Picture": "figure",
|
||||||
|
"Text": "paragraph",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class InputDocument(BaseModel):
    file: PurePath = None
    document_hash: Optional[str] = None
    valid: bool = False
    limits: DocumentLimits = DocumentLimits()

    filesize: Optional[int] = None
    page_count: Optional[int] = None

    _backend: PdfDocumentBackend = None  # Internal PDF backend used

    def __init__(
        self,
        path_or_stream: Union[BytesIO, Path],
        filename: Optional[str] = None,
        limits: Optional[DocumentLimits] = None,
        pdf_backend=PyPdfiumDocumentBackend,
    ):
        super().__init__()

        self.limits = limits or DocumentLimits()

        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
                self.filesize = path_or_stream.stat().st_size
                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            elif isinstance(path_or_stream, BytesIO):
                self.file = PurePath(filename)
                self.filesize = path_or_stream.getbuffer().nbytes

                if self.filesize > self.limits.max_file_size:
                    self.valid = False
                else:
                    self.document_hash = create_file_hash(path_or_stream)
                    self._backend = pdf_backend(path_or_stream=path_or_stream)

            if self.document_hash and self._backend.page_count() > 0:
                self.page_count = self._backend.page_count()

                if self.page_count <= self.limits.max_num_pages:
                    self.valid = True

        except (FileNotFoundError, OSError) as e:
            _log.exception(
                f"File {self.file.name} not found or cannot be opened.", exc_info=e
            )
            # raise
        except RuntimeError as e:
            _log.exception(
                f"An unexpected error occurred while opening the document {self.file.name}",
                exc_info=e,
            )
            # raise


class ConvertedDocument(BaseModel):
    input: InputDocument

    status: ConversionStatus = ConversionStatus.PENDING  # failure, success
    errors: List[Dict] = []  # structure to keep errors

    pages: List[Page] = []
    assembled: AssembledUnit = None

    output: DsDocument = None

    def to_ds_document(self) -> DsDocument:
        title = ""
        desc = DsDocumentDescription(logs=[])

        page_hashes = [
            PageReference(hash=p.page_hash, page=p.page_no, model="default")
            for p in self.pages
        ]

        file_info = DsFileInfoObject(
            filename=self.input.file.name,
            document_hash=self.input.document_hash,
            num_pages=self.input.page_count,
            page_hashes=page_hashes,
        )

        main_text = []
        tables = []
        figures = []

        page_no_to_page = {p.page_no: p for p in self.pages}

        for element in self.assembled.elements:
            # Convert bboxes to lower-left origin.
            target_bbox = DsBoundingBox(
                element.cluster.bbox.to_bottom_left_origin(
                    page_no_to_page[element.page_no].size.height
                ).as_tuple()
            )

            if isinstance(element, TextElement):
                main_text.append(
                    BaseText(
                        text=element.text,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        name=element.label,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, len(element.text)],
                            )
                        ],
                    )
                )
            elif isinstance(element, TableElement):
                index = len(tables)
                ref_str = f"#/tables/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )

                # Initialise empty table data grid (only empty cells)
                table_data = [
                    [
                        TableCell(
                            text="",
                            # bbox=[0,0,0,0],
                            spans=[[i, j]],
                            obj_type="body",
                        )
                        for j in range(element.num_cols)
                    ]
                    for i in range(element.num_rows)
                ]

                # Overwrite cells in table data for which there is actual cell content.
                for cell in element.table_cells:
                    for i in range(
                        min(cell.start_row_offset_idx, element.num_rows),
                        min(cell.end_row_offset_idx, element.num_rows),
                    ):
                        for j in range(
                            min(cell.start_col_offset_idx, element.num_cols),
                            min(cell.end_col_offset_idx, element.num_cols),
                        ):
                            celltype = "body"
                            if cell.column_header:
                                celltype = "col_header"
                            elif cell.row_header:
                                celltype = "row_header"

                            def make_spans(cell):
                                for rspan in range(
                                    min(cell.start_row_offset_idx, element.num_rows),
                                    min(cell.end_row_offset_idx, element.num_rows),
                                ):
                                    for cspan in range(
                                        min(
                                            cell.start_col_offset_idx, element.num_cols
                                        ),
                                        min(cell.end_col_offset_idx, element.num_cols),
                                    ):
                                        yield [rspan, cspan]

                            spans = list(make_spans(cell))
                            table_data[i][j] = TableCell(
                                text=cell.text,
                                bbox=cell.bbox.to_bottom_left_origin(
                                    page_no_to_page[element.page_no].size.height
                                ).as_tuple(),
                                # col=j,
                                # row=i,
                                spans=spans,
                                obj_type=celltype,
                                # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
                                # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
                            )

                tables.append(
                    DsSchemaTable(
                        num_cols=element.num_cols,
                        num_rows=element.num_rows,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        data=table_data,
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                    )
                )

            elif isinstance(element, FigureElement):
                index = len(figures)
                ref_str = f"#/figures/{index}"
                main_text.append(
                    Ref(
                        name=element.label,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        ref=ref_str,
                    ),
                )
                figures.append(
                    BaseCell(
                        prov=[
                            Prov(
                                bbox=target_bbox,
                                page=element.page_no,
                                span=[0, 0],
                            )
                        ],
                        obj_type=layout_label_to_ds_type.get(element.label),
                        # data=[[]],
                    )
                )

        page_dimensions = [
            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
            for p in self.pages
        ]

        ds_doc = DsDocument(
            name=title,
            description=desc,
            file_info=file_info,
            main_text=main_text,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
        )

        return ds_doc

    def render_as_dict(self):
        if self.output:
            return self.output.model_dump(by_alias=True, exclude_none=True)
        else:
            return {}

    def render_as_markdown(self):
        if self.output:
            return export_to_markdown(
                self.output.model_dump(by_alias=True, exclude_none=True)
            )
        else:
            return ""


class DocumentConversionInput(BaseModel):

    _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
    limits: Optional[DocumentLimits] = DocumentLimits()

    DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend

    def docs(
        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
    ) -> Iterable[InputDocument]:

        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND

        for obj in self._path_or_stream_iterator:
            if isinstance(obj, Path):
                yield InputDocument(
                    path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
                )
            elif isinstance(obj, DocumentStream):
                yield InputDocument(
                    path_or_stream=obj.stream,
                    filename=obj.filename,
                    limits=self.limits,
                    pdf_backend=pdf_backend,
                )

    @classmethod
    def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
        paths = [Path(p) for p in paths]

        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = paths

        return doc_input

    @classmethod
    def from_streams(
        cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
    ):
        doc_input = cls(limits=limits)
        doc_input._path_or_stream_iterator = streams

        return doc_input
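A minimal usage sketch for the input model above; the file name is illustrative and the DocumentLimits argument is optional:

from pathlib import Path

from docling.datamodel.document import DocumentConversionInput
from docling.datamodel.settings import DocumentLimits

# "report.pdf" is a placeholder path; InputDocument validates file size and
# page count against the given DocumentLimits when the backend opens the file.
doc_input = DocumentConversionInput.from_paths(
    [Path("report.pdf")], limits=DocumentLimits(max_num_pages=50)
)
for in_doc in doc_input.docs():
    print(in_doc.file.name, in_doc.valid, in_doc.page_count)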
32
docling/datamodel/settings.py
Normal file
@@ -0,0 +1,32 @@
import sys

from pydantic import BaseModel
from pydantic_settings import BaseSettings


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize


class BatchConcurrencySettings(BaseModel):
    doc_batch_size: int = 2
    doc_batch_concurrency: int = 2
    page_batch_size: int = 4
    page_batch_concurrency: int = 2

    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1

    # model_concurrency: int = 2

    # To force models into single core: export OMP_NUM_THREADS=1


class AppSettings(BaseSettings):
    perf: BatchConcurrencySettings


settings = AppSettings(perf=BatchConcurrencySettings())
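A small sketch of how these defaults could be tuned at runtime; it assumes the pydantic models keep their default, mutable configuration:

from docling.datamodel.settings import settings

# Process one document and one page batch at a time, e.g. while debugging.
settings.perf.doc_batch_size = 1
settings.perf.page_batch_size = 1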
207
docling/document_converter.py
Normal file
@@ -0,0 +1,207 @@
import functools
import logging
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union

from PIL import ImageDraw

from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
    AssembledUnit,
    ConversionStatus,
    Page,
    PipelineOptions,
)
from docling.datamodel.document import (
    ConvertedDocument,
    DocumentConversionInput,
    InputDocument,
)
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash

_log = logging.getLogger(__name__)


class DocumentConverter:
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(
        self,
        artifacts_path: Optional[Union[Path, str]] = None,
        pipeline_options: PipelineOptions = PipelineOptions(),
        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
    ):
        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        artifacts_path = Path(artifacts_path)

        self.model_pipeline = pipeline_cls(
            artifacts_path=artifacts_path, pipeline_options=pipeline_options
        )

        self.page_assemble_model = PageAssembleModel(config={})
        self.glm_model = GlmModel(config={})
        self.pdf_backend = pdf_backend

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:

        for input_batch in chunkify(
            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
        ):
            _log.info(f"Going to convert document batch...")
            # parallel processing only within input_batch
            # with ThreadPoolExecutor(
            #    max_workers=settings.perf.doc_batch_concurrency
            # ) as pool:
            #   yield from pool.map(self.process_document, input_batch)

            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
            yield from map(self.process_document, input_batch)

    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
        start_doc_time = time.time()
        converted_doc = ConvertedDocument(input=in_doc)

        if not in_doc.valid:
            converted_doc.status = ConversionStatus.FAILURE
            return converted_doc

        for i in range(0, in_doc.page_count):
            converted_doc.pages.append(Page(page_no=i))

        all_assembled_pages = []

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(
                converted_doc.pages, settings.perf.page_batch_size
            ):

                start_pb_time = time.time()
                # Pipeline

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Populate page image
                pages_with_images = map(
                    functools.partial(self.populate_page_images, in_doc), init_pages
                )

                # 3. Populate programmatic page cells
                pages_with_cells = map(
                    functools.partial(self.parse_page_cells, in_doc),
                    pages_with_images,
                )

                pipeline_pages = self.model_pipeline.apply(pages_with_cells)

                # 7. Assemble page elements (per page)
                assembled_pages = self.page_assemble_model(pipeline_pages)

                # exhaust assembled_pages
                for assembled_page in assembled_pages:
                    # Free up mem resources before moving on with next batch
                    assembled_page.image = (
                        None  # Comment this if you want to visualize page images
                    )
                    assembled_page._backend.unload()

                    all_assembled_pages.append(assembled_page)

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

            # Free up mem resources of PDF backend
            in_doc._backend.unload()

            converted_doc.pages = all_assembled_pages
            self.assemble_doc(converted_doc)

            converted_doc.status = ConversionStatus.SUCCESS

        except Exception as e:
            converted_doc.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.info(f"Encountered an error during conversion: {trace}")

        end_doc_time = time.time() - start_doc_time
        _log.info(
            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
        )

        return converted_doc

    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()
        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))

        return page

    # Generate the page image and store it in the page object
    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
        page.image = page._backend.get_page_image()

        return page

    # Extract and populate the page cells and store it in the page object
    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
        page.cells = page._backend.get_text_cells()

        # DEBUG code:
        def draw_text_boxes(image, cells):
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            image.show()

        # draw_text_boxes(page.image, cells)

        return page

    def assemble_doc(self, converted_doc: ConvertedDocument):
        all_elements = []
        all_headers = []
        all_body = []

        for p in converted_doc.pages:

            for el in p.assembled.body:
                all_body.append(el)
            for el in p.assembled.headers:
                all_headers.append(el)
            for el in p.assembled.elements:
                all_elements.append(el)

        converted_doc.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        converted_doc.output = self.glm_model(converted_doc)
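An end-to-end sketch of the converter defined above; "sample.pdf" and the output path are illustrative, and the first run downloads model artifacts from Hugging Face:

from pathlib import Path

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc_input = DocumentConversionInput.from_paths([Path("sample.pdf")])

for converted in converter.convert(doc_input):
    if converted.status == ConversionStatus.SUCCESS:
        Path("sample.md").write_text(converted.render_as_markdown())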
0
docling/models/__init__.py
Normal file
82
docling/models/ds_glm_model.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import copy
|
||||||
|
import random
|
||||||
|
|
||||||
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
||||||
|
from deepsearch_glm.utils.ds_utils import to_legacy_document_format
|
||||||
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
||||||
|
from docling_core.types import BaseText
|
||||||
|
from docling_core.types import Document as DsDocument
|
||||||
|
from docling_core.types import Ref
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
|
||||||
|
from docling.datamodel.document import ConvertedDocument
|
||||||
|
|
||||||
|
|
||||||
|
class GlmModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
load_pretrained_nlp_models()
|
||||||
|
model = init_nlp_model(model_names="language;term;reference")
|
||||||
|
self.model = model
|
||||||
|
|
||||||
|
def __call__(self, document: ConvertedDocument) -> DsDocument:
|
||||||
|
ds_doc = document.to_ds_document()
|
||||||
|
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
||||||
|
|
||||||
|
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
||||||
|
ds_doc_dict = to_legacy_document_format(
|
||||||
|
glm_doc, ds_doc_dict, update_name_label=True
|
||||||
|
)
|
||||||
|
|
||||||
|
exported_doc = DsDocument.model_validate(ds_doc_dict)
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
|
clusters_to_draw = []
|
||||||
|
image = copy.deepcopy(document.pages[page_no].image)
|
||||||
|
for ix, elem in enumerate(ds_document.main_text):
|
||||||
|
if isinstance(elem, BaseText):
|
||||||
|
prov = elem.prov[0]
|
||||||
|
elif isinstance(elem, Ref):
|
||||||
|
_, arr, index = elem.ref.split("/")
|
||||||
|
index = int(index)
|
||||||
|
if arr == "tables":
|
||||||
|
prov = ds_document.tables[index].prov[0]
|
||||||
|
elif arr == "figures":
|
||||||
|
prov = ds_document.figures[index].prov[0]
|
||||||
|
else:
|
||||||
|
prov = None
|
||||||
|
|
||||||
|
if prov and prov.page == page_no:
|
||||||
|
clusters_to_draw.append(
|
||||||
|
Cluster(
|
||||||
|
id=ix,
|
||||||
|
label=elem.name,
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=prov.bbox,
|
||||||
|
origin=CoordOrigin.BOTTOMLEFT,
|
||||||
|
).to_top_left_origin(document.pages[page_no].size.height),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in clusters_to_draw:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||||
|
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in c.cells: # [:1]:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells(ds_doc, 0)
|
||||||
|
# draw_clusters_and_cells(exported_doc, 0)
|
||||||
|
|
||||||
|
return exported_doc
|
77
docling/models/easyocr_model.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EasyOcrModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.enabled = config["enabled"]
|
||||||
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||||
|
|
||||||
|
if self.enabled:
|
||||||
|
import easyocr
|
||||||
|
|
||||||
|
self.reader = easyocr.Reader(config["lang"])
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
|
if not self.enabled:
|
||||||
|
yield from page_batch
|
||||||
|
return
|
||||||
|
|
||||||
|
for page in page_batch:
|
||||||
|
# rects = page._fpage.
|
||||||
|
high_res_image = page._backend.get_page_image(scale=self.scale)
|
||||||
|
im = numpy.array(high_res_image)
|
||||||
|
result = self.reader.readtext(im)
|
||||||
|
|
||||||
|
del high_res_image
|
||||||
|
del im
|
||||||
|
|
||||||
|
cells = [
|
||||||
|
OcrCell(
|
||||||
|
id=ix,
|
||||||
|
text=line[1],
|
||||||
|
confidence=line[2],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=(
|
||||||
|
line[0][0][0] / self.scale,
|
||||||
|
line[0][0][1] / self.scale,
|
||||||
|
line[0][2][0] / self.scale,
|
||||||
|
line[0][2][1] / self.scale,
|
||||||
|
),
|
||||||
|
origin=CoordOrigin.TOPLEFT,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
for ix, line in enumerate(result)
|
||||||
|
]
|
||||||
|
|
||||||
|
page.cells = cells # For now, just overwrites all digital cells.
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = copy.deepcopy(page.image)
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in cells:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
yield page
|
318
docling/models/layout_model.py
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
BoundingBox,
|
||||||
|
Cell,
|
||||||
|
Cluster,
|
||||||
|
CoordOrigin,
|
||||||
|
LayoutPrediction,
|
||||||
|
Page,
|
||||||
|
)
|
||||||
|
from docling.utils import layout_utils as lu
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LayoutModel:
|
||||||
|
|
||||||
|
TEXT_ELEM_LABELS = [
|
||||||
|
"Text",
|
||||||
|
"Footnote",
|
||||||
|
"Caption",
|
||||||
|
"Checkbox-Unselected",
|
||||||
|
"Checkbox-Selected",
|
||||||
|
"Section-header",
|
||||||
|
"Page-header",
|
||||||
|
"Page-footer",
|
||||||
|
"Code",
|
||||||
|
"List-item",
|
||||||
|
# "Formula",
|
||||||
|
]
|
||||||
|
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
|
||||||
|
|
||||||
|
TABLE_LABEL = "Table"
|
||||||
|
FIGURE_LABEL = "Picture"
|
||||||
|
FORMULA_LABEL = "Formula"
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.layout_predictor = LayoutPredictor(
|
||||||
|
config["artifacts_path"]
|
||||||
|
) # TODO temporary
|
||||||
|
|
||||||
|
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
|
||||||
|
MIN_INTERSECTION = 0.2
|
||||||
|
CLASS_THRESHOLDS = {
|
||||||
|
"Caption": 0.35,
|
||||||
|
"Footnote": 0.35,
|
||||||
|
"Formula": 0.35,
|
||||||
|
"List-item": 0.35,
|
||||||
|
"Page-footer": 0.35,
|
||||||
|
"Page-header": 0.35,
|
||||||
|
"Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
|
||||||
|
"Section-header": 0.45,
|
||||||
|
"Table": 0.35,
|
||||||
|
"Text": 0.45,
|
||||||
|
"Title": 0.45,
|
||||||
|
"Document Index": 0.45,
|
||||||
|
"Code": 0.45,
|
||||||
|
"Checkbox-Selected": 0.45,
|
||||||
|
"Checkbox-Unselected": 0.45,
|
||||||
|
"Form": 0.45,
|
||||||
|
"Key-Value Region": 0.45,
|
||||||
|
}
|
||||||
|
|
||||||
|
_log.debug("================= Start postprocess function ====================")
|
||||||
|
start_time = time.time()
|
||||||
|
# Apply Confidence Threshold to cluster predictions
|
||||||
|
# confidence = self.conf_threshold
|
||||||
|
clusters_out = []
|
||||||
|
|
||||||
|
for cluster in clusters:
|
||||||
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
||||||
|
if cluster.confidence >= confidence:
|
||||||
|
# annotation["created_by"] = "high_conf_pred"
|
||||||
|
clusters_out.append(cluster)
|
||||||
|
|
||||||
|
# map to dictionary clusters and cells, with bottom left origin
|
||||||
|
clusters = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"confidence": c.confidence,
|
||||||
|
"cell_ids": [],
|
||||||
|
"type": c.label,
|
||||||
|
}
|
||||||
|
for c in clusters
|
||||||
|
]
|
||||||
|
|
||||||
|
clusters_out = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"confidence": c.confidence,
|
||||||
|
"created_by": "high_conf_pred",
|
||||||
|
"cell_ids": [],
|
||||||
|
"type": c.label,
|
||||||
|
}
|
||||||
|
for c in clusters_out
|
||||||
|
]
|
||||||
|
|
||||||
|
raw_cells = [
|
||||||
|
{
|
||||||
|
"id": c.id,
|
||||||
|
"bbox": list(
|
||||||
|
c.bbox.to_bottom_left_origin(page_height).as_tuple()
|
||||||
|
), # TODO
|
||||||
|
"text": c.text,
|
||||||
|
}
|
||||||
|
for c in cells
|
||||||
|
]
|
||||||
|
cell_count = len(raw_cells)
|
||||||
|
|
||||||
|
_log.debug("---- 0. Treat cluster overlaps ------")
|
||||||
|
clusters_out = lu.remove_cluster_duplicates_by_conf(clusters_out, 0.8)
|
||||||
|
|
||||||
|
_log.debug(
|
||||||
|
"---- 1. Initially assign cells to clusters based on minimum intersection ------"
|
||||||
|
)
|
||||||
|
## Check for cells included in or touched by clusters:
|
||||||
|
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||||
|
clusters_out, raw_cells, MIN_INTERSECTION
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 2. Assign Orphans with Low Confidence Detections")
|
||||||
|
# Creates a map of cell_id->cluster_id
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
# Assign orphan cells with lower confidence predictions
|
||||||
|
clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
|
||||||
|
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
# Refresh the cell_ids assignment, after creating new clusters using low conf predictions
|
||||||
|
clusters_out = lu.assigning_cell_ids_to_clusters(
|
||||||
|
clusters_out, raw_cells, MIN_INTERSECTION
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 3. Settle Ambigous Cells")
|
||||||
|
# Creates an update map after assignment of cell_id->cluster_id
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
# Settle pdf cells that belong to multiple clusters
|
||||||
|
clusters_out, ambiguous_cell_indices = lu.remove_ambigous_pdf_cell_by_conf(
|
||||||
|
clusters_out, raw_cells, ambiguous_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 4. Set Orphans as Text")
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
|
||||||
|
clusters_out, clusters, raw_cells, orphan_cell_indices
|
||||||
|
)
|
||||||
|
|
||||||
|
_log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
|
||||||
|
# Merge cells orphan cells
|
||||||
|
clusters_out = lu.merge_cells(clusters_out)
|
||||||
|
|
||||||
|
# Clean up clusters that remain from merged and unreasonable clusters
|
||||||
|
clusters_out = lu.clean_up_clusters(
|
||||||
|
clusters_out,
|
||||||
|
raw_cells,
|
||||||
|
merge_cells=True,
|
||||||
|
img_table=True,
|
||||||
|
one_cell_table=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
new_clusters = lu.adapt_bboxes(raw_cells, clusters_out, orphan_cell_indices)
|
||||||
|
clusters_out = new_clusters
|
||||||
|
|
||||||
|
## We first rebuild where every cell is now:
|
||||||
|
## Now we write into a prediction cells list, not into the raw cells list.
|
||||||
|
## As we don't need previous labels, we best overwrite any old list, because that might
|
||||||
|
## have been sorted differently.
|
||||||
|
(
|
||||||
|
clusters_around_cells,
|
||||||
|
orphan_cell_indices,
|
||||||
|
ambiguous_cell_indices,
|
||||||
|
) = lu.cell_id_state_map(clusters_out, cell_count)
|
||||||
|
|
||||||
|
target_cells = []
|
||||||
|
for ix, cell in enumerate(raw_cells):
|
||||||
|
new_cell = {
|
||||||
|
"id": ix,
|
||||||
|
"rawcell_id": ix,
|
||||||
|
"label": "None",
|
||||||
|
"bbox": cell["bbox"],
|
||||||
|
"text": cell["text"],
|
||||||
|
}
|
||||||
|
for cluster_index in clusters_around_cells[
|
||||||
|
ix
|
||||||
|
]: # By previous analysis, this is always 1 cluster.
|
||||||
|
new_cell["label"] = clusters_out[cluster_index]["type"]
|
||||||
|
target_cells.append(new_cell)
|
||||||
|
# _log.debug("New label of cell " + str(ix) + " is " + str(new_cell["label"]))
|
||||||
|
cells_out = target_cells
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Sort clusters into reasonable reading order, and sort the cells inside each cluster
|
||||||
|
_log.debug("---- 5. Sort clusters in reading order ------")
|
||||||
|
sorted_clusters = lu.produce_reading_order(
|
||||||
|
clusters_out, "raw_cell_ids", "raw_cell_ids", True
|
||||||
|
)
|
||||||
|
clusters_out = sorted_clusters
|
||||||
|
|
||||||
|
# end_time = timer()
|
||||||
|
_log.debug("---- End of postprocessing function ------")
|
||||||
|
end_time = time.time() - start_time
|
||||||
|
_log.debug(f"Finished post processing in seconds={end_time:.3f}")
|
||||||
|
|
||||||
|
cells_out = [
|
||||||
|
Cell(
|
||||||
|
id=c["id"],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_height),
|
||||||
|
text=c["text"],
|
||||||
|
)
|
||||||
|
for c in cells_out
|
||||||
|
]
|
||||||
|
clusters_out_new = []
|
||||||
|
for c in clusters_out:
|
||||||
|
cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
|
||||||
|
c_new = Cluster(
|
||||||
|
id=c["id"],
|
||||||
|
bbox=BoundingBox.from_tuple(
|
||||||
|
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
|
||||||
|
).to_top_left_origin(page_height),
|
||||||
|
confidence=c["confidence"],
|
||||||
|
label=c["type"],
|
||||||
|
cells=cluster_cells,
|
||||||
|
)
|
||||||
|
clusters_out_new.append(c_new)
|
||||||
|
|
||||||
|
return clusters_out_new, cells_out
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
for page in page_batch:
|
||||||
|
clusters = []
|
||||||
|
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
|
||||||
|
cluster = Cluster(
|
||||||
|
id=ix,
|
||||||
|
label=pred_item["label"],
|
||||||
|
confidence=pred_item["confidence"],
|
||||||
|
bbox=BoundingBox.model_validate(pred_item),
|
||||||
|
cells=[],
|
||||||
|
)
|
||||||
|
clusters.append(cluster)
|
||||||
|
|
||||||
|
# Map cells to clusters
|
||||||
|
# TODO: Remove, postprocess should take care of it anyway.
|
||||||
|
for cell in page.cells:
|
||||||
|
for cluster in clusters:
|
||||||
|
if not cell.bbox.area() > 0:
|
||||||
|
overlap_frac = 0.0
|
||||||
|
else:
|
||||||
|
overlap_frac = (
|
||||||
|
cell.bbox.intersection_area_with(cluster.bbox)
|
||||||
|
/ cell.bbox.area()
|
||||||
|
)
|
||||||
|
|
||||||
|
if overlap_frac > 0.5:
|
||||||
|
cluster.cells.append(cell)
|
||||||
|
|
||||||
|
# Pre-sort clusters
|
||||||
|
# clusters = self.sort_clusters_by_cell_order(clusters)
|
||||||
|
|
||||||
|
# DEBUG code:
|
||||||
|
def draw_clusters_and_cells():
|
||||||
|
image = copy.deepcopy(page.image)
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
for c in clusters:
|
||||||
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
||||||
|
|
||||||
|
cell_color = (
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
random.randint(30, 140),
|
||||||
|
)
|
||||||
|
for tc in c.cells: # [:1]:
|
||||||
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
||||||
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||||
|
image.show()
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
clusters, page.cells = self.postprocess(
|
||||||
|
clusters, page.cells, page.size.height
|
||||||
|
)
|
||||||
|
|
||||||
|
# draw_clusters_and_cells()
|
||||||
|
|
||||||
|
page.predictions.layout = LayoutPrediction(clusters=clusters)
|
||||||
|
|
||||||
|
yield page
|
160
docling/models/page_assemble_model.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Iterable, List
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
AssembledUnit,
|
||||||
|
FigureElement,
|
||||||
|
Page,
|
||||||
|
PageElement,
|
||||||
|
TableElement,
|
||||||
|
TextElement,
|
||||||
|
)
|
||||||
|
from docling.models.layout_model import LayoutModel
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PageAssembleModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
|
||||||
|
|
||||||
|
# def sanitize_text_poor(self, lines):
|
||||||
|
# text = '\n'.join(lines)
|
||||||
|
#
|
||||||
|
# # treat line wraps.
|
||||||
|
# sanitized_text = self.line_wrap_pattern.sub('', text)
|
||||||
|
#
|
||||||
|
# sanitized_text = sanitized_text.replace('\n', ' ')
|
||||||
|
#
|
||||||
|
# return sanitized_text
|
||||||
|
|
||||||
|
def sanitize_text(self, lines):
|
||||||
|
if len(lines) <= 1:
|
||||||
|
return " ".join(lines)
|
||||||
|
|
||||||
|
for ix, line in enumerate(lines[1:]):
|
||||||
|
prev_line = lines[ix]
|
||||||
|
|
||||||
|
if prev_line.endswith("-"):
|
||||||
|
prev_words = re.findall(r"\b[\w]+\b", prev_line)
|
||||||
|
line_words = re.findall(r"\b[\w]+\b", line)
|
||||||
|
|
||||||
|
if (
|
||||||
|
len(prev_words)
|
||||||
|
and len(line_words)
|
||||||
|
and prev_words[-1].isalnum()
|
||||||
|
and line_words[0].isalnum()
|
||||||
|
):
|
||||||
|
lines[ix] = prev_line[:-1]
|
||||||
|
else:
|
||||||
|
lines[ix] += " "
|
||||||
|
|
||||||
|
sanitized_text = "".join(lines)
|
||||||
|
|
||||||
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
for page in page_batch:
|
||||||
|
# assembles some JSON output page by page.
|
||||||
|
|
||||||
|
elements: List[PageElement] = []
|
||||||
|
headers: List[PageElement] = []
|
||||||
|
body: List[PageElement] = []
|
||||||
|
|
||||||
|
for cluster in page.predictions.layout.clusters:
|
||||||
|
# _log.info("Cluster label seen:", cluster.label)
|
||||||
|
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||||
|
|
||||||
|
textlines = [
|
||||||
|
cell.text.replace("\x02", "-").strip()
|
||||||
|
for cell in cluster.cells
|
||||||
|
if len(cell.text.strip()) > 0
|
||||||
|
]
|
||||||
|
text = self.sanitize_text(textlines)
|
||||||
|
text_el = TextElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text=text,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=cluster,
|
||||||
|
)
|
||||||
|
elements.append(text_el)
|
||||||
|
|
||||||
|
if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
|
||||||
|
headers.append(text_el)
|
||||||
|
else:
|
||||||
|
body.append(text_el)
|
||||||
|
elif cluster.label == LayoutModel.TABLE_LABEL:
|
||||||
|
tbl = None
|
||||||
|
if page.predictions.tablestructure:
|
||||||
|
tbl = page.predictions.tablestructure.table_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not tbl
|
||||||
|
): # fallback: add table without structure, if it isn't present
|
||||||
|
tbl = TableElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
otsl_seq=[],
|
||||||
|
table_cells=[],
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
|
||||||
|
elements.append(tbl)
|
||||||
|
body.append(tbl)
|
||||||
|
elif cluster.label == LayoutModel.FIGURE_LABEL:
|
||||||
|
fig = None
|
||||||
|
if page.predictions.figures_classification:
|
||||||
|
fig = page.predictions.figures_classification.figure_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not fig
|
||||||
|
): # fallback: add figure without classification, if it isn't present
|
||||||
|
fig = FigureElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
text="",
|
||||||
|
data=None,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
)
|
||||||
|
elements.append(fig)
|
||||||
|
body.append(fig)
|
||||||
|
elif cluster.label == LayoutModel.FORMULA_LABEL:
|
||||||
|
equation = None
|
||||||
|
if page.predictions.equations_prediction:
|
||||||
|
equation = (
|
||||||
|
page.predictions.equations_prediction.equation_map.get(
|
||||||
|
cluster.id, None
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not equation: # fallback: add empty formula, if it isn't present
|
||||||
|
text = self.sanitize_text(
|
||||||
|
[
|
||||||
|
cell.text.replace("\x02", "-").strip()
|
||||||
|
for cell in cluster.cells
|
||||||
|
if len(cell.text.strip()) > 0
|
||||||
|
]
|
||||||
|
)
|
||||||
|
equation = TextElement(
|
||||||
|
label=cluster.label,
|
||||||
|
id=cluster.id,
|
||||||
|
cluster=cluster,
|
||||||
|
page_no=page.page_no,
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
elements.append(equation)
|
||||||
|
body.append(equation)
|
||||||
|
|
||||||
|
page.assembled = AssembledUnit(
|
||||||
|
elements=elements, headers=headers, body=body
|
||||||
|
)
|
||||||
|
|
||||||
|
yield page
|
114
docling/models/table_structure_model.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import (
|
||||||
|
BoundingBox,
|
||||||
|
Page,
|
||||||
|
TableCell,
|
||||||
|
TableElement,
|
||||||
|
TableStructurePrediction,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TableStructureModel:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.do_cell_matching = config["do_cell_matching"]
|
||||||
|
|
||||||
|
self.enabled = config["enabled"]
|
||||||
|
if self.enabled:
|
||||||
|
artifacts_path = config["artifacts_path"]
|
||||||
|
# Third Party
|
||||||
|
import docling_ibm_models.tableformer.common as c
|
||||||
|
|
||||||
|
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
||||||
|
self.tm_config["model"]["save_dir"] = artifacts_path
|
||||||
|
self.tm_model_type = self.tm_config["model"]["type"]
|
||||||
|
|
||||||
|
self.tf_predictor = TFPredictor(self.tm_config)
|
||||||
|
|
||||||
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||||
|
|
||||||
|
if not self.enabled:
|
||||||
|
yield from page_batch
|
||||||
|
return
|
||||||
|
|
||||||
|
for page in page_batch:
|
||||||
|
page.predictions.tablestructure = TableStructurePrediction() # dummy
|
||||||
|
|
||||||
|
in_tables = [
|
||||||
|
(
|
||||||
|
cluster,
|
||||||
|
[
|
||||||
|
round(cluster.bbox.l),
|
||||||
|
round(cluster.bbox.t),
|
||||||
|
round(cluster.bbox.r),
|
||||||
|
round(cluster.bbox.b),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
for cluster in page.predictions.layout.clusters
|
||||||
|
if cluster.label == "Table"
|
||||||
|
]
|
||||||
|
if not len(in_tables):
|
||||||
|
yield page
|
||||||
|
continue
|
||||||
|
|
||||||
|
tokens = []
|
||||||
|
for c in page.cells:
|
||||||
|
for cluster, _ in in_tables:
|
||||||
|
if c.bbox.area() > 0:
|
||||||
|
if (
|
||||||
|
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
|
||||||
|
> 0.2
|
||||||
|
):
|
||||||
|
# Only allow non empty stings (spaces) into the cells of a table
|
||||||
|
if len(c.text.strip()) > 0:
|
||||||
|
tokens.append(c.model_dump())
|
||||||
|
|
||||||
|
iocr_page = {
|
||||||
|
"image": numpy.asarray(page.image),
|
||||||
|
"tokens": tokens,
|
||||||
|
"width": page.size.width,
|
||||||
|
"height": page.size.height,
|
||||||
|
}
|
||||||
|
|
||||||
|
table_clusters, table_bboxes = zip(*in_tables)
|
||||||
|
|
||||||
|
if len(table_bboxes):
|
||||||
|
tf_output = self.tf_predictor.multi_table_predict(
|
||||||
|
iocr_page, table_bboxes, do_matching=self.do_cell_matching
|
||||||
|
)
|
||||||
|
|
||||||
|
for table_cluster, table_out in zip(table_clusters, tf_output):
|
||||||
|
table_cells = []
|
||||||
|
for element in table_out["tf_responses"]:
|
||||||
|
|
||||||
|
if not self.do_cell_matching:
|
||||||
|
the_bbox = BoundingBox.model_validate(element["bbox"])
|
||||||
|
text_piece = page._backend.get_text_in_rect(the_bbox)
|
||||||
|
element["bbox"]["token"] = text_piece
|
||||||
|
|
||||||
|
tc = TableCell.model_validate(element)
|
||||||
|
table_cells.append(tc)
|
||||||
|
|
||||||
|
# Retrieving cols/rows, after post processing:
|
||||||
|
num_rows = table_out["predict_details"]["num_rows"]
|
||||||
|
num_cols = table_out["predict_details"]["num_cols"]
|
||||||
|
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
|
||||||
|
|
||||||
|
tbl = TableElement(
|
||||||
|
otsl_seq=otsl_seq,
|
||||||
|
table_cells=table_cells,
|
||||||
|
num_rows=num_rows,
|
||||||
|
num_cols=num_cols,
|
||||||
|
id=table_cluster.id,
|
||||||
|
page_no=page.page_no,
|
||||||
|
cluster=table_cluster,
|
||||||
|
label="Table",
|
||||||
|
)
|
||||||
|
|
||||||
|
page.predictions.tablestructure.table_map[table_cluster.id] = tbl
|
||||||
|
|
||||||
|
yield page
|
0
docling/pipeline/__init__.py
Normal file
18
docling/pipeline/base_model_pipeline.py
Normal file
@@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions


class BaseModelPipeline:
    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        self.model_pipe = []
        self.artifacts_path = artifacts_path
        self.pipeline_options = pipeline_options

    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch
40
docling/pipeline/standard_model_pipeline.py
Normal file
@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline


class StandardModelPipeline(BaseModelPipeline):
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

        self.model_pipe = [
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": False,
                }
            ),
        ]
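A short sketch of how the pipeline options consumed above are expected to be passed through DocumentConverter; the PipelineOptions field names come from this commit, but constructing it with keyword arguments is an assumption:

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter

# do_ocr feeds the EasyOcrModel config and do_table_structure feeds the
# TableStructureModel config built in StandardModelPipeline above.
options = PipelineOptions(do_ocr=True, do_table_structure=True)
converter = DocumentConverter(pipeline_options=options)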
0
docling/utils/__init__.py
Normal file
806
docling/utils/layout_utils.py
Normal file
@@ -0,0 +1,806 @@
|
|||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
|
logger = logging.getLogger("layout_utils")
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Geometric helper functions
|
||||||
|
## The coordinates grow left to right, and bottom to top.
|
||||||
|
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
|
||||||
|
|
||||||
|
|
||||||
|
def area(bbox):
|
||||||
|
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
||||||
|
|
||||||
|
|
||||||
|
def contains(bbox_i, bbox_j):
|
||||||
|
## Returns True if bbox_i contains bbox_j, else False
|
||||||
|
return (
|
||||||
|
bbox_i[0] <= bbox_j[0]
|
||||||
|
and bbox_i[1] <= bbox_j[1]
|
||||||
|
and bbox_i[2] >= bbox_j[2]
|
||||||
|
and bbox_i[3] >= bbox_j[3]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_intersecting(bbox_i, bbox_j):
|
||||||
|
return not (
|
||||||
|
bbox_i[2] < bbox_j[0]
|
||||||
|
or bbox_i[0] > bbox_j[2]
|
||||||
|
or bbox_i[3] < bbox_j[1]
|
||||||
|
or bbox_i[1] > bbox_j[3]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def bb_iou(boxA, boxB):
|
||||||
|
# determine the (x, y)-coordinates of the intersection rectangle
|
||||||
|
xA = max(boxA[0], boxB[0])
|
||||||
|
yA = max(boxA[1], boxB[1])
|
||||||
|
xB = min(boxA[2], boxB[2])
|
||||||
|
yB = min(boxA[3], boxB[3])
|
||||||
|
# compute the area of intersection rectangle
|
||||||
|
interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
|
||||||
|
# compute the area of both the prediction and ground-truth
|
||||||
|
# rectangles
|
||||||
|
boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
|
||||||
|
boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
|
||||||
|
# compute the intersection over union by taking the intersection
|
||||||
|
# area and dividing it by the sum of prediction + ground-truth
|
||||||
|
# areas - the interesection area
|
||||||
|
iou = interArea / float(boxAArea + boxBArea - interArea)
|
||||||
|
# return the intersection over union value
|
||||||
|
return iou
|
||||||
|
|
||||||
|
|
||||||
|
def compute_intersection(bbox_i, bbox_j):
|
||||||
|
## Returns the size of the intersection area of the two boxes
|
||||||
|
if not is_intersecting(bbox_i, bbox_j):
|
||||||
|
return 0
|
||||||
|
## Determine the (x, y)-coordinates of the intersection rectangle:
|
||||||
|
xA = max(bbox_i[0], bbox_j[0])
|
||||||
|
yA = max(bbox_i[1], bbox_j[1])
|
||||||
|
xB = min(bbox_i[2], bbox_j[2])
|
||||||
|
yB = min(bbox_i[3], bbox_j[3])
|
||||||
|
## Compute the area of intersection rectangle:
|
||||||
|
interArea = (xB - xA) * (yB - yA)
|
||||||
|
if interArea < 0:
|
||||||
|
logger.debug("Warning: Negative intersection detected!")
|
||||||
|
return 0
|
||||||
|
return interArea
|
||||||
|
|
||||||
|
|
||||||
|
def surrounding(bbox_i, bbox_j):
|
||||||
|
## Computes minimal box that contains both input boxes
|
||||||
|
sbox = []
|
||||||
|
sbox.append(min(bbox_i[0], bbox_j[0]))
|
||||||
|
sbox.append(min(bbox_i[1], bbox_j[1]))
|
||||||
|
sbox.append(max(bbox_i[2], bbox_j[2]))
|
||||||
|
sbox.append(max(bbox_i[3], bbox_j[3]))
|
||||||
|
return sbox
|
||||||
|
|
||||||
|
|
||||||
|
def surrounding_list(bbox_list):
|
||||||
|
## Computes minimal box that contains all boxes in the input list
|
||||||
|
## The list should be non-empty, but just in case it's not:
|
||||||
|
if len(bbox_list) == 0:
|
||||||
|
sbox = [0, 0, 0, 0]
|
||||||
|
else:
|
||||||
|
sbox = []
|
||||||
|
sbox.append(min([bbox[0] for bbox in bbox_list]))
|
||||||
|
sbox.append(min([bbox[1] for bbox in bbox_list]))
|
||||||
|
sbox.append(max([bbox[2] for bbox in bbox_list]))
|
||||||
|
sbox.append(max([bbox[3] for bbox in bbox_list]))
|
||||||
|
return sbox
|
||||||
|
|
||||||
|
|
||||||
|
def vertical_overlap(bboxA, bboxB):
|
||||||
|
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||||
|
if bboxB[3] < bboxA[1]: ## B below A
|
||||||
|
return False
|
||||||
|
elif bboxA[3] < bboxB[1]: ## A below B
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def vertical_overlap_fraction(bboxA, bboxB):
|
||||||
|
## Returns the vertical overlap as fraction of the lower bbox height.
|
||||||
|
## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
|
||||||
|
## Height 0 is permitted in the input.
|
||||||
|
heightA = bboxA[3] - bboxA[1]
|
||||||
|
heightB = bboxB[3] - bboxB[1]
|
||||||
|
min_height = min(heightA, heightB)
|
||||||
|
if bboxA[3] >= bboxB[3]: ## A starts higher or equal
|
||||||
|
if (
|
||||||
|
bboxA[1] <= bboxB[1]
|
||||||
|
): ## B is completely in A; this can include height of B = 0:
|
||||||
|
fraction = 1
|
||||||
|
else:
|
||||||
|
overlap = max(bboxB[3] - bboxA[1], 0)
|
||||||
|
fraction = overlap / max(min_height, 0.001)
|
||||||
|
else:
|
||||||
|
if (
|
||||||
|
bboxB[1] <= bboxA[1]
|
||||||
|
): ## A is completely in B; this can include height of A = 0:
|
||||||
|
fraction = 1
|
||||||
|
else:
|
||||||
|
overlap = max(bboxA[3] - bboxB[1], 0)
|
||||||
|
fraction = overlap / max(min_height, 0.001)
|
||||||
|
return fraction
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Cluster-and-cell relations
|
||||||
|
|
||||||
|
|
||||||
|
def compute_enclosed_cells(
|
||||||
|
cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
|
||||||
|
):
|
||||||
|
cells_in_cluster = []
|
||||||
|
cells_in_cluster_int = []
|
||||||
|
for ix, cell in enumerate(raw_cells):
|
||||||
|
cell_bbox = cell["bbox"]
|
||||||
|
intersection = compute_intersection(cell_bbox, cluster_bbox)
|
||||||
|
frac_area = area(cell_bbox) * min_cell_intersection_with_cluster
|
||||||
|
|
||||||
|
if (
|
||||||
|
intersection > frac_area and frac_area > 0
|
||||||
|
): # intersect > certain fraction of cell
|
||||||
|
cells_in_cluster.append(ix)
|
||||||
|
cells_in_cluster_int.append(intersection)
|
||||||
|
elif contains(
|
||||||
|
cluster_bbox,
|
||||||
|
[cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
|
||||||
|
):
|
||||||
|
cells_in_cluster.append(ix)
|
||||||
|
return cells_in_cluster, cells_in_cluster_int
|
||||||
|
|
||||||
|
|
||||||
|
def find_clusters_around_cells(cell_count, clusters):
|
||||||
|
## Per raw cell, find to which clusters it belongs.
|
||||||
|
## Return list of these indices in the raw-cell order.
|
||||||
|
clusters_around_cells = [[] for _ in range(cell_count)]
|
||||||
|
for cl_ix, cluster in enumerate(clusters):
|
||||||
|
for ix in cluster["cell_ids"]:
|
||||||
|
clusters_around_cells[ix].append(cl_ix)
|
||||||
|
return clusters_around_cells
|
||||||
|
|
||||||
|
|
||||||
|
def find_cell_index(raw_ix, cell_array):
|
||||||
|
## "raw_ix" is a rawcell_id.
|
||||||
|
## "cell_array" has the structure of an (annotation) cells array.
|
||||||
|
## Returns index of cell in cell_array that has this rawcell_id.
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
return ix
|
||||||
|
|
||||||
|
|
||||||
|
def find_cell_indices(cluster, cell_array):
|
||||||
|
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||||
|
## "cell_array" that of a cells array.
|
||||||
|
## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
|
||||||
|
## in the order of the rawcell_ids.
|
||||||
|
result = []
|
||||||
|
for raw_ix in sorted(cluster["cell_ids"]):
|
||||||
|
## Find the cell with this rawcell_id (if any)
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
result.append(ix)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def find_first_cell_index(cluster, cell_array):
|
||||||
|
## "cluster" must be a dict with key "cell_ids"; it can also be a line.
|
||||||
|
## "cell_array" has the structure of a cells array in an annotation.
|
||||||
|
## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
|
||||||
|
result = [] ## We keep it a list as it can be empty (picture without text cells)
|
||||||
|
if len(cluster["cell_ids"]) == 0:
|
||||||
|
return result
|
||||||
|
raw_ix = min(cluster["cell_ids"])
|
||||||
|
## Find the cell with this rawcell_id (if any)
|
||||||
|
for ix, cell in enumerate(cell_array):
|
||||||
|
if cell["rawcell_id"] == raw_ix:
|
||||||
|
result.append(ix)
|
||||||
|
break ## One is enough; should be only one anyway.
|
||||||
|
if result == []:
|
||||||
|
logger.debug(
|
||||||
|
" Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
## -------------------------------
|
||||||
|
## Cluster labels and text
|
||||||
|
|
||||||
|
|
||||||
|
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
|
||||||
|
## "cluster" must have the structure as in a clusters array in a prediction,
|
||||||
|
## "cl_ix" is its index in target_pred,
|
||||||
|
## "new_label" is the intended new label,
|
||||||
|
## "target_pred" is the entire current target prediction.
|
||||||
|
## Sets label on the cluster itself, and on the cells in the target_pred.
|
||||||
|
## Returns new_label so that also the cl_label variable in the main code is easily set.
|
||||||
|
target_pred["clusters"][cl_ix]["type"] = new_label
|
||||||
|
cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
|
||||||
|
for ix in cluster_target_cells:
|
||||||
|
target_pred["cells"][ix]["label"] = new_label
|
||||||
|
return new_label
|
||||||
|
|
||||||
|
|
||||||
|
def find_cluster_text(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"].
    ## Returns the text of the cluster, with blanks between the cell contents
    ## (which seem to be words or phrases without leading or trailing blanks).
    ## Note that in formulas, this may give a lot more blanks than originally present.
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
    return cluster_text.rstrip()


def find_cluster_text_without_blanks(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"].
    ## Returns the text of the cluster, without blanks between the cell contents.
    ## Interesting in formula analysis.
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"]
    return cluster_text.rstrip()

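## Example (illustrative, hypothetical cells) of the two text-joining helpers above:
##   raw = [{"text": "Hello"}, {"text": "world"}]
##   find_cluster_text({"cell_ids": [1, 0]}, raw)                 ->  "Hello world"
##   find_cluster_text_without_blanks({"cell_ids": [1, 0]}, raw)  ->  "Helloworld"
## The cell_ids are visited in sorted order, not in the order given.

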
## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)


def build_cluster_from_lines(lines, label, id):
    ## "lines" must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
    ## (There is no condition that they are really geometrically lines)
    ## A cluster in standard format is returned with given label and id
    local_lines = copy.deepcopy(
        lines
    )  ## without this, it changes "lines" also outside this function
    first_line = local_lines.pop(0)
    cluster = {
        "id": id,
        "type": label,
        "cell_ids": first_line["cell_ids"],
        "bbox": first_line["bbox"],
        "confidence": 0,
        "created_by": "merged_cells",
    }
    confidence = 0
    counter = 0
    for line in local_lines:
        new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
        cluster["cell_ids"] = new_cell_ids
        cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
        counter += 1
        confidence += line["confidence"]
    confidence = confidence / counter
    cluster["confidence"] = confidence
    return cluster

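## Example (illustrative, hypothetical lines; assumes surrounding() returns the enclosing box):
##   lines = [
##       {"cell_ids": [0, 1], "bbox": [0, 0, 10, 10], "confidence": 0.8},
##       {"cell_ids": [2], "bbox": [0, 12, 10, 20], "confidence": 0.6},
##   ]
##   build_cluster_from_lines(lines, "Text", 5)
##   ->  {"id": 5, "type": "Text", "cell_ids": [0, 1, 2], "bbox": [0, 0, 10, 20],
##        "confidence": 0.6, "created_by": "merged_cells"}
## Note that the returned confidence is the average over the lines after the first one.

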
## -------------------------------
## Reading order


def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    ## In:
    ##   clusters: list as in predictions.
    ##   cluster_sort_type: string, currently only "raw_cell_ids".
    ##   cell_sort_type: string, currently only "raw_cell_ids".
    ##   sort_ids: Boolean, whether the cluster ids should be adapted to their new position.
    ## Out: Another clusters list, sorted according to the type.

    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            sorted_cell_ids = sorted(cl["cell_ids"])
            cl["cell_ids"] = sorted_cell_ids
    else:
        logger.debug(
            "Unknown cell_sort_type `"
            + cell_sort_type
            + "`, no cell sorting will happen."
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `"
            + cluster_sort_type
            + "`, no cluster sorting will happen."
        )
        sorted_clusters = clusters  ## keep the input order so the return below is defined

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters

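## Example (illustrative, hypothetical clusters):
##   clusters = [
##       {"id": 7, "cell_ids": [5, 3]},
##       {"id": 2, "cell_ids": [0]},
##       {"id": 9, "cell_ids": []},
##   ]
##   produce_reading_order(clusters, "raw_cell_ids", "raw_cell_ids", sort_ids=True)
##   returns the cluster starting at raw cell 0 first (new id 0), then the one starting
##   at raw cell 3 (new id 1), and appends the empty cluster last (new id 2).

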
## -------------------------------
## Line Splitting


def sort_cells_horizontal(line_cell_ids, raw_cells):
    ## "line_cell_ids" should be a non-empty list of (raw) cell_ids.
    ## "raw_cells" has the structure of item["raw"]["cells"].
    ## Sorts the cells in the line by x0 (left start).
    new_line_cell_ids = sorted(
        line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
    )
    return new_line_cell_ids

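## Example (illustrative, hypothetical cells with bbox = [x0, y0, x1, y1]):
##   raw = [{"bbox": [50, 0, 60, 10]}, {"bbox": [10, 0, 20, 10]}]
##   sort_cells_horizontal([0, 1], raw)  ->  [1, 0]
## i.e. cell 1 comes first because it starts further to the left.

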
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    new_clusters = []
    for ix, cluster in enumerate(clusters):
        new_cluster = copy.deepcopy(cluster)
        logger.debug(
            "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
        )
        logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
            logger.debug(" Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.
        new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
        new_cluster["bbox"] = new_bbox
        new_clusters.append(new_cluster)
    return new_clusters


def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    if not (cluster["type"] in ["Table", "Picture"]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" New bounding box:" + str(new_bbox))
    if cluster["type"] == "Picture":
        ## We only make the bbox completely comprise included text cells:
        logger.debug(" Picture")
        if len(cluster["cell_ids"]) != 0:
            min_bbox = surrounding_list(
                [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
            )
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else:  ## A table
        ## At least we have to keep the included text cells, and we make the bbox completely comprise them
        min_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        ## Now we look which non-belonging cells are covered.
        ## (To decrease dependencies, we don't make use of which cells we actually removed.)
        ## We don't worry about orphan cells, those could still be added to the table.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )
        if len(spurious_cells) == 0:
            return new_bbox

        ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
        ## We initialize possible cuts with the current bbox.
        left_cut = new_bbox[0]
        right_cut = new_bbox[2]
        upper_cut = new_bbox[3]
        lower_cut = new_bbox[1]

        for cell_ix in spurious_cells:
            cell = raw_cells[cell_ix]
            # logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
            is_left = cell["bbox"][2] < min_bbox[0]
            is_right = cell["bbox"][0] > min_bbox[2]
            is_above = cell["bbox"][1] > min_bbox[3]
            is_below = cell["bbox"][3] < min_bbox[1]
            # logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))

            if is_left:
                if cell["bbox"][2] > left_cut:
                    ## We move the left cut to exclude this cell:
                    left_cut = cell["bbox"][2]
            if is_right:
                if cell["bbox"][0] < right_cut:
                    ## We move the right cut to exclude this cell:
                    right_cut = cell["bbox"][0]
            if is_above:
                if cell["bbox"][1] < upper_cut:
                    ## We move the upper cut to exclude this cell:
                    upper_cut = cell["bbox"][1]
            if is_below:
                if cell["bbox"][3] > lower_cut:
                    ## We move the lower cut to exclude this cell:
                    lower_cut = cell["bbox"][3]
            # logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))

        new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

        logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox


def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    DuplicateDeletedClusterIDs = []
    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                if_conf = False
                if cluster_1["confidence"] > cluster_2["confidence"]:
                    if_conf = True
                if if_conf == True:
                    if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                    elif contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions

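## Example (illustrative, hypothetical clusters; assumes bb_iou() is the usual
## intersection-over-union): the lower-confidence of two heavily overlapping
## clusters is dropped.
##   preds = [
##       {"id": 0, "confidence": 0.9, "bbox": [0, 0, 100, 50], "type": "Text", "cell_ids": []},
##       {"id": 1, "confidence": 0.4, "bbox": [2, 2, 98, 48], "type": "Text", "cell_ids": []},
##   ]
##   remove_cluster_duplicates_by_conf(preds, threshold=0.5)  ->  only cluster 0 remains

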
# Assign orphan cells to a low-confidence prediction (one below the confidence used for
# the assigned clusters), if it overlaps them well enough
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    for orph_id in orphan_cell_indices:
        cluster_chosen = {}
        iou_thresh = 0.05
        confidence = 0.05

        # Loop over all predictions, and find the one with the highest IOU, and confidence
        for cluster in cluster_predictions_low:
            calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
            cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
                cluster["bbox"][2] - cluster["bbox"][0]
            )
            cell_area = (
                raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
            ) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])

            if (
                (iou_thresh < calc_iou)
                and (cluster["confidence"] > confidence)
                and (cell_area * 3 > cluster_area)
            ):
                cluster_chosen = cluster
                iou_thresh = calc_iou
                confidence = cluster["confidence"]
        # If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
        if iou_thresh != 0.05 and confidence != 0.05:
            cluster_chosen["cell_ids"].append(orph_id)
            cluster_chosen["created_by"] = "orph_low_conf"
            cluster_predictions.append(cluster_chosen)
            orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices


def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    for amb_cell_id in amb_cell_idxs:
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        problamatic_clusters = []

        # Find clusters in question
        for cluster in cluster_predictions:

            if amb_cell_id in cluster["cell_ids"]:
                problamatic_clusters.append(amb_cell_id)

                # If the cell_id is in a cluster of high conf, and highest iou score, and smaller in area
                bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])

                if (
                    cluster["confidence"] > highest_conf
                    and bbox_iou_val > highest_bbox_iou
                ):
                    cluster_chosen = cluster
                    highest_conf = cluster["confidence"]
                    highest_bbox_iou = bbox_iou_val
                    if cluster["id"] in problamatic_clusters:
                        problamatic_clusters.remove(cluster["id"])

        # now remove the assigning of cell id from lower confidence, and threshold
        for cluster in cluster_predictions:
            for prob_amb_id in problamatic_clusters:
                if prob_amb_id in cluster["cell_ids"]:
                    cluster["cell_ids"].remove(prob_amb_id)
        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs


def ranges(nums):
    # Find if consecutive numbers exist within pdf cells
    # Used to remove line numbers for review manuscripts
    nums = sorted(set(nums))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))

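# Example (illustrative, hypothetical values): consecutive numbers collapse into
# (start, end) runs, isolated numbers become single-element runs.
#   ranges([1, 2, 3, 7, 8, 12])  ->  [(1, 3), (7, 8), (12, 12)]

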
def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
        if cluster["type"] == "Picture":
            figures.append(cluster)

        if cluster["id"] > max_id:
            max_id = cluster["id"]
    max_id += 1

    lines_detector = False
    content_of_orphans = []
    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        content_of_orphans.append(raw_cells[orph_id]["text"])

    fil_content_of_orphans = []
    for cell_content in content_of_orphans:
        if cell_content.isnumeric():
            try:
                num = int(cell_content)
                fil_content_of_orphans.append(num)
            except ValueError:  # ignore the cell
                pass

    # line_orphans = []
    # If there are more than 2 numeric orphan cells, check (using the ranges function)
    # whether they form long consecutive series of numbers; such series are most likely
    # the line numbers of a review manuscript and should be ignored.

    if len(fil_content_of_orphans) > 2:
        out_ranges = ranges(fil_content_of_orphans)
        if len(out_ranges) > 1:
            cnt_range = 0
            for ranges_ in out_ranges:
                if ranges_[0] != ranges_[1]:
                    # If a run covers more than 75 numbers (half the total line number of
                    # a review manuscript page), decide that there are line numbers on the
                    # page to be ignored.
                    if len(list(range(ranges_[0], ranges_[1]))) > 75:
                        lines_detector = True
                        # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
            fig_flag = False
            # Do not assign orphan cells if they are inside a figure
            for fig in figures:
                if contains(fig["bbox"], orph_cell["bbox"]):
                    fig_flag = True

            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
                class_type = "Text"

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
                    class_type = "Text"
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
                    ):
                        class_type = cluster["type"]
                    elif contains(
                        cluster["bbox"],
                        [
                            orph_cell["bbox"][0] + 3,
                            orph_cell["bbox"][1] + 3,
                            orph_cell["bbox"][2] - 3,
                            orph_cell["bbox"][3] - 3,
                        ],
                    ):
                        class_type = cluster["type"]
                    elif intersection > area(orph_cell["bbox"]) * 0.2:
                        class_type = cluster["type"]

                new_cluster = {
                    "id": max_id,
                    "bbox": orph_cell["bbox"],
                    "type": class_type,
                    "cell_ids": [orph_id],
                    "confidence": -1,
                    "created_by": "orphan_default",
                }
                max_id += 1
                cluster_predictions.append(new_cluster)
    return cluster_predictions, orphan_cell_indices


def merge_cells(cluster_predictions):
    # Build a graph over the "orphan_default" clusters and merge those whose bboxes touch
    # or are very close (within 2 units) into one cluster per connected component.
    G = nx.Graph()
    for cluster in cluster_predictions:
        if cluster["created_by"] == "orphan_default":
            G.add_node(cluster["id"])

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if (
                cluster_1["id"] != cluster_2["id"]
                and cluster_2["created_by"] == "orphan_default"
                and cluster_1["created_by"] == "orphan_default"
            ):
                cl1 = copy.deepcopy(cluster_1["bbox"])
                cl2 = copy.deepcopy(cluster_2["bbox"])
                cl1[0] = cl1[0] - 2
                cl1[1] = cl1[1] - 2
                cl1[2] = cl1[2] + 2
                cl1[3] = cl1[3] + 2
                cl2[0] = cl2[0] - 2
                cl2[1] = cl2[1] - 2
                cl2[2] = cl2[2] + 2
                cl2[3] = cl2[3] + 2
                if is_intersecting(cl1, cl2):
                    G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
    max_id = -1
    for cluster_1 in cluster_predictions:
        if cluster_1["id"] > max_id:
            max_id = cluster_1["id"]

    for nodes in component:
        if len(nodes) > 1:
            max_id += 1
            lines = []
            for node in nodes:
                for cluster in cluster_predictions:
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions


def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    DuplicateDeletedClusterIDs = []

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                # remove any artifacts created by merging clusters
                if merge_cells == True:
                    if contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        cluster_1["cell_ids"] = (
                            cluster_1["cell_ids"] + cluster_2["cell_ids"]
                        )
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
                        cluster_1["type"] == "Text"
                        and cluster_2["type"] == "Picture"
                        or cluster_2["type"] == "Table"
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                        elif contains(
                            [
                                cluster_2["bbox"][0] - 3,
                                cluster_2["bbox"][1] - 3,
                                cluster_2["bbox"][2] + 3,
                                cluster_2["bbox"][3] + 3,
                            ],
                            cluster_1["bbox"],
                        ):
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                # remove tables that have one pdf cell
                if one_cell_table == True:
                    if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
                        DuplicateDeletedClusterIDs.append(cluster_1["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions


def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    for cluster in clusters:
        cells_in_cluster, _ = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        cluster["cell_ids"] = cells_in_cluster
    ## These cell_ids are ids of the raw cells.
    ## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
    return clusters


# Creates a map of cell_id -> list of cluster indices, plus the lists of orphan and
# ambiguous cells
def cell_id_state_map(clusters, cell_count):
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
    orphan_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
    ]  # which cells are assigned no cluster?
    ambiguous_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
    ]  # which cells are assigned > 1 clusters?
    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices
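## Example (illustrative, hypothetical clusters):
##   clusters = [{"cell_ids": [0, 1]}, {"cell_ids": [1]}]
##   cell_id_state_map(clusters, cell_count=3)
##   ->  ([[0], [0, 1], []], [2], [1])
## i.e. cell 2 is an orphan (no cluster claims it) and cell 1 is ambiguous
## (claimed by both clusters 0 and 1).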
41
docling/utils/utils.py
Normal file
@ -0,0 +1,41 @@
import hashlib
from io import BytesIO
from itertools import islice
from pathlib import Path
from typing import List, Union


def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable."""
    if isinstance(iterator, List):
        iterator = iter(iterator)
    for first in iterator:  # Take the first element from the iterator
        yield [first] + list(islice(iterator, chunk_size - 1))

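# Example (illustrative): chunkify works on lists and on plain iterators alike.
#   list(chunkify([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]
#   list(chunkify(range(5), 2))         ->  [[0, 1], [2, 3], [4]]

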
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Create a stable page_hash of the path_or_stream of a file"""

    block_size = 65536
    hasher = hashlib.sha256()

    def _hash_buf(binary_stream):
        buf = binary_stream.read(block_size)  # read and page_hash in chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = binary_stream.read(block_size)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as afile:
            _hash_buf(afile)
    elif isinstance(path_or_stream, BytesIO):
        _hash_buf(path_or_stream)

    return hasher.hexdigest()

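# Example (illustrative; the digest value depends on the file contents):
#   create_file_hash(Path("./test/data/2206.01062.pdf"))  ->  64-character hex SHA-256 string
#   create_file_hash(BytesIO(b"hello"))  ->  same value as hashlib.sha256(b"hello").hexdigest()

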
def create_hash(string: str):
    hasher = hashlib.sha256()
    hasher.update(string.encode("utf-8"))

    return hasher.hexdigest()
73
examples/convert.py
Normal file
@ -0,0 +1,73 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    converted_docs: Iterable[ConvertedDocument],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for doc in converted_docs:
        if doc.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = doc.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(doc.render_as_dict()))

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(doc.render_as_markdown())
        else:
            _log.info(f"Document {doc.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        # Path("/Users/cau/Downloads/Issue-36122.pdf"),
        # Path("/Users/cau/Downloads/IBM_Storage_Insights_Fact_Sheet.pdf"),
        Path("./test/data/2206.01062.pdf"),
        Path("./test/data/2203.01017v2.pdf"),
        Path("./test/data/2305.03393v1.pdf"),
    ]

    artifacts_path = DocumentConverter.download_models_hf()

    doc_converter = DocumentConverter(artifacts_path=artifacts_path)

    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    converted_docs = doc_converter.convert(input)
    export_documents(converted_docs, output_dir=Path("./scratch"))

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")


if __name__ == "__main__":
    main()
11
examples/minimal.py
Normal file
@ -0,0 +1,11 @@
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

artifacts_path = DocumentConverter.download_models_hf()
doc_converter = DocumentConverter(artifacts_path=artifacts_path)

input = DocumentConversionInput.from_paths(["factsheet.pdf"])
converted_docs = doc_converter.convert(input)

for d in converted_docs:
    print(d.render_as_dict())
4865
poetry.lock
generated
Normal file
File diff suppressed because it is too large
72
pyproject.toml
Normal file
@ -0,0 +1,72 @@
[tool.poetry]
name = "docling"
version = "0.1.0"
description = "Docling PDF conversion package"
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
readme = "README.md"
keywords = ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3"
]
packages = [{include = "docling"}]

[tool.poetry.dependencies]
python = "^3.11"
pydantic = "^2.0.0"
docling-core = "^0.2.0"
docling-ibm-models = "^0.2.0"
deepsearch-glm = ">=0.18.4,<1"
deepsearch-toolkit = ">=0.47.0,<1"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"

[tool.poetry.group.ocr.dependencies]
easyocr = "^1.7"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 88
target-version = ["py311"]
include = '\.pyi?$'

[tool.isort]
profile = "black"
line_length = 88
py_version = 311

[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
python_version = "3.11"

[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]
BIN
test/data/2203.01017v2.pdf
Normal file
Binary file not shown.
BIN
test/data/2206.01062.pdf
Normal file
Binary file not shown.
BIN
test/data/2305.03393v1.pdf
Normal file
Binary file not shown.
33
test/test_backend_pdfium.py
Normal file
@ -0,0 +1,33 @@
from pathlib import Path

import pytest

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.datamodel.base_models import BoundingBox


@pytest.fixture
def test_doc_path():
    return Path("./data/2206.01062.pdf")


def test_get_text_from_rect(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    textpiece = page_backend.get_text_in_rect(bbox=BoundingBox(l=102, t=77, r=511, b=124))
    ref = "DocLayNet: A Large Human-Annotated Dataset for\r\nDocument-Layout Analysis"

    assert textpiece.strip() == ref


def test_crop_page_image(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527))
    # im.show()


def test_num_pages(test_doc_path):
    doc_backend = PyPdfiumDocumentBackend(test_doc_path)
    assert doc_backend.page_count() == 9