Docling/pyproject.toml
2025-05-12 09:44:26 +00:00

285 lines
7.9 KiB
TOML

[tool.poetry]
name = "docling"
version = "2.31.1" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
"Christoph Auer <cau@zurich.ibm.com>",
"Michele Dolfi <dol@zurich.ibm.com>",
"Maxim Lysak <mly@zurich.ibm.com>",
"Nikos Livathinos <nli@zurich.ibm.com>",
"Ahmed Nassar <ahn@zurich.ibm.com>",
"Panos Vagenas <pva@zurich.ibm.com>",
"Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/docling-project/docling"
homepage = "https://github.com/docling-project/docling"
keywords = [
"docling",
"convert",
"document",
"pdf",
"docx",
"html",
"markdown",
"layout model",
"segmentation",
"table structure",
"table former",
]
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS :: MacOS X",
"Operating System :: POSIX :: Linux",
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3",
]
packages = [{ include = "docling" }]
[tool.poetry.dependencies]
######################
# actual dependencies:
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = {version = "^2.26.0", extras = ["chunking"]}
docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.2"
easyocr = "^1.7"
tesserocr = { version = "^2.7.1", optional = true }
certifi = ">=2024.7.4"
rtree = "^1.3.0"
scipy = [
{ version = "^1.6.0", markers = "python_version >= '3.10'" },
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
]
typer = ">=0.12.5,<0.16.0"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxruntime = [
# 1.19.2 is the last version with python3.9 support,
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
]
transformers = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
]
accelerate = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"
pluggy = "^1.0.0"
pylatexenc = "^2.10"
[tool.poetry.group.dev.dependencies]
python = "^3.9.2"
black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
coverage = "^7.6.2"
pytest-cov = "^6.0.0"
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
griffe-pydantic = "^1.1.0"
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
python-dotenv = "^1.0.1"
langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4"
[tool.poetry.group.constraints]
optional = true
[tool.poetry.group.constraints.dependencies]
numpy = [
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
]
[tool.poetry.group.mac_intel]
optional = true
[tool.poetry.group.mac_intel.dependencies]
torch = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
]
torchvision = [
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
]
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers", "accelerate"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
[tool.poetry.plugins."docling"]
"docling_defaults" = "docling.models.plugins.defaults"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.ruff]
target-version = "py39"
line-length = 88
respect-gitignore = true
# extend-exclude = [
# "tests",
# ]
[tool.ruff.format]
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = [
# "B", # flake8-bugbear
"C", # flake8-comprehensions
"C9", # mccabe
# "D", # flake8-docstrings
"E", # pycodestyle errors (default)
"F", # pyflakes (default)
"I", # isort
"PD", # pandas-vet
"PIE", # pie
# "PTH", # pathlib
"Q", # flake8-quotes
# "RET", # return
"RUF", # Enable all ruff-specific checks
# "SIM", # simplify
"S307", # eval
# "T20", # (disallow print statements) keep debugging statements out of the codebase
"W", # pycodestyle warnings
"ASYNC", # async
"UP", # pyupgrade
]
ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
]
#extend-select = []
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
"pydantic.validator",
]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
[tool.ruff.lint.mccabe]
max-complexity = 20
# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
[tool.ruff.lint.isort]
combine-as-imports = true
# section-order = [
# "future",
# "standard-library",
# "third-party",
# "docling",
# "first-party",
# "local-folder",
# ]
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
python_version = "3.10"
[[tool.mypy.overrides]]
module = [
"docling_parse.*",
"pypdfium2.*",
"networkx.*",
"scipy.*",
"filetype.*",
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"ocrmac.*",
"mlx_vlm.*",
"lxml.*",
"huggingface_hub.*",
"transformers.*",
"pylatexenc.*",
]
ignore_missing_imports = true
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
version_source = "tag_only"
branch = "main"
# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"