
* Skeleton for SmolDocling model and VLM Pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * wip smolDocling inference and vlm pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * WIP, first working code for inference of SmolDocling, and vlm pipeline assembly code, example included. Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixes to preserve page image and demo export to html Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Enabled figure support in vlm_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fix for table span compute in vlm_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Properly propagating image data per page, together with predicted tags in VLM pipeline. This enables correct figure extraction and page numbers in provenances Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Cleaned up logs, added pages to vlm_pipeline, basic timing per page measurement in smol_docling models Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Replaced hardcoded otsl tokens with the ones from docling-core tokens.py enum Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added tokens/sec measurement, improved example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added capability for vlm_pipeline to grab text from preconfigured backend Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Exposed "force_backend_text" as pipeline parameter Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Flipped keep_backend to True for vlm_pipeline assembly to work Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated vlm pipeline assembly and smol docling model code to support updated doctags Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixing doctags starting tag, that broke elements on first line during assembly Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Introduced SmolDoclingOptions to configure model parameters (such as query and artifacts path) via client code, 
see example in minimal_smol_docling. Provisioning for other potential vlm all-in-one models. Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Moved artifacts_path for SmolDocling into vlm_options instead of global pipeline option Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * New assembly code for latest model revision, updated prompt and parsing of doctags, updated logging Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated example of Smol Docling usage Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added captions for the images for SmolDocling assembly code, improved provenance definition for all elements Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Update minimal smoldocling example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix repo id Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Cleaned up unnecessary logging Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * More elegant solution in removing the input prompt Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removed minimal_smol_docling example from CI checks Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Removed special html code wrapping when exporting to docling document, cleaned up comments Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Addressing PR comments, added enabled property to SmolDocling, and related VLM pipeline option, few other minor things Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Moved keep_backend = True to vlm pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removed pipeline_options.generate_table_images from vlm_pipeline (deprecated in the pipelines) Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added example on how to get original predicted doctags in minimal_smol_docling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removing changes from base_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Replaced remaining strings to appropriate enums Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated 
poetry.lock Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * re-built poetry.lock Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Generalize and refactor VLM pipeline and models Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move imports Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Expose control over using flash_attention_2 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix VLM example exclusion in CI Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back device_map and accelerate Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Make drawing code resilient against bad bboxes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: clean up code and comments Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: more cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: fix leftover .to(device) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: add proper table provenance Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
192 lines
6.0 KiB
TOML
# Poetry package metadata for the `docling` distribution.
[tool.poetry]
name = "docling"
version = "2.24.0"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
  "Christoph Auer <cau@zurich.ibm.com>",
  "Michele Dolfi <dol@zurich.ibm.com>",
  "Maxim Lysak <mly@zurich.ibm.com>",
  "Nikos Livathinos <nli@zurich.ibm.com>",
  "Ahmed Nassar <ahn@zurich.ibm.com>",
  "Panos Vagenas <pva@zurich.ibm.com>",
  "Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/DS4SD/docling"
homepage = "https://github.com/DS4SD/docling"
keywords = [
  "docling",
  "convert",
  "document",
  "pdf",
  "docx",
  "html",
  "markdown",
  "layout model",
  "segmentation",
  "table structure",
  "table former",
]
classifiers = [
  "License :: OSI Approved :: MIT License",
  "Operating System :: MacOS :: MacOS X",
  "Operating System :: POSIX :: Linux",
  "Development Status :: 5 - Production/Stable",
  "Intended Audience :: Developers",
  "Intended Audience :: Science/Research",
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
  "Programming Language :: Python :: 3",
]
# The distribution ships the `docling` package directory.
packages = [{ include = "docling" }]

# Runtime dependencies. Entries flagged `optional = true` are only pulled in
# through the extras declared in `[tool.poetry.extras]`.
[tool.poetry.dependencies]
######################
# actual dependencies:
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = { extras = ["chunking"], version = "^2.19.0" }
docling-ibm-models = "^3.4.0"
docling-parse = "^3.3.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.2"
easyocr = "^1.7"
tesserocr = { version = "^2.7.1", optional = true }
certifi = ">=2024.7.4"
rtree = "^1.3.0"
# scipy is capped below 1.14.0 on Python < 3.10; newer interpreters take any 1.x >= 1.6.0.
scipy = [
  { version = "^1.6.0", markers = "python_version >= '3.10'" },
  { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
]
typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
# macOS-only optional dependency (restricted by the `sys_platform` marker).
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxruntime = [
  # 1.19.2 is the last version with python3.9 support,
  # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
  { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
  { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
]

# Intel macOS (darwin + x86_64) is held on the 4.42.x series; every other
# platform resolves ^4.46.0.
transformers = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
]
# accelerate is only declared for platforms other than Intel macOS.
accelerate = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"

# Development-only dependency group (not part of the package's runtime requirements).
[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
# NOTE(review): both pyproject-flake8 and flake8-pyproject are listed in this
# group; they appear to serve the same purpose — confirm both are needed.
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"

# Dependency group for building the documentation site (MkDocs and plugins).
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
griffe-pydantic = "^1.1.0"

# Dependency group named `examples`.
# NOTE(review): presumably needed only by the example scripts/notebooks — confirm.
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
python-dotenv = "^1.0.1"
langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4"

# Optional group: not installed by default, only when explicitly selected
# (e.g. `poetry install --with constraints`).
[tool.poetry.group.constraints]
optional = true

[tool.poetry.group.constraints.dependencies]
# numpy is capped below 2.1.0 on Python < 3.10; newer interpreters allow up to (not including) 3.0.0.
numpy = [
  { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
  { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
]

# Optional group: not installed by default, only when explicitly selected
# (e.g. `poetry install --with mac_intel`).
[tool.poetry.group.mac_intel]
optional = true

[tool.poetry.group.mac_intel.dependencies]
# Intel macOS (darwin + x86_64) is held on torch 2.2.x / torchvision 0.17.x;
# every other platform uses the wider caret ranges.
torch = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
]
torchvision = [
  { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
  { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
]

# Extras map a user-facing name (e.g. `pip install docling[vlm]`) to the
# optional dependencies it enables.
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers", "accelerate"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

# Console entry points: command name -> "module:object" to invoke.
[tool.poetry.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"

# PEP 517 build configuration: the package is built with poetry-core.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# black formatter settings.
[tool.black]
line-length = 88
target-version = ["py39"]
# Regex (literal string, no escaping needed) selecting `.py` and `.pyi` files.
include = '\.pyi?$'

# isort settings; the "black" profile and line length of 88 mirror the
# values in `[tool.black]`.
[tool.isort]
profile = "black"
line_length = 88
py_version = 39

# mypy type-checker settings.
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
# NOTE(review): black and isort in this file target Python 3.9 and the package
# declares python = "^3.9", while mypy checks against 3.10 — confirm this is
# intentional.
python_version = "3.10"

# Modules for which mypy must not report missing imports
# (`ignore_missing_imports = true` applies to every pattern listed).
[[tool.mypy.overrides]]
module = [
  "docling_parse.*",
  "pypdfium2.*",
  "networkx.*",
  "scipy.*",
  "filetype.*",
  "tesserocr.*",
  "docling_ibm_models.*",
  "easyocr.*",
  "ocrmac.*",
  "lxml.*",
  "huggingface_hub.*",
  "transformers.*",
]
ignore_missing_imports = true

# flake8 settings; the line length of 88 matches `[tool.black]`.
[tool.flake8]
max-line-length = 88
# E203: whitespace before punctuation; E501: line too long.
extend-ignore = ["E203", "E501"]

[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

version_source = "tag_only"
branch = "main"

# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"