
# Commit message:
#
# * refactor: upgrade BeautifulSoup4 with type hints
#
#   Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
#   Refactor backends using BeautifulSoup4 to comply with type hints.
#   Apply style simplifications and improvements for consistency.
#   Remove variables and functions that are never used.
#   Remove code duplication between backends for parsing HTML tables.
#
#   Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
#
# * build: allow beautifulsoup4 version 4.12.3
#
#   Allow older version of beautifulsoup4 and ensure compatibility.
#   Update library dependencies.
#
#   Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
#
# ---------
#
# Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
#
# File view: 190 lines, 5.9 KiB, TOML
# Poetry package metadata for the `docling` distribution.
[tool.poetry]
name = "docling"
version = "2.23.0"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = [
    "Christoph Auer <cau@zurich.ibm.com>",
    "Michele Dolfi <dol@zurich.ibm.com>",
    "Maxim Lysak <mly@zurich.ibm.com>",
    "Nikos Livathinos <nli@zurich.ibm.com>",
    "Ahmed Nassar <ahn@zurich.ibm.com>",
    "Panos Vagenas <pva@zurich.ibm.com>",
    "Peter Staar <taa@zurich.ibm.com>",
]
license = "MIT"
readme = "README.md"
repository = "https://github.com/DS4SD/docling"
homepage = "https://github.com/DS4SD/docling"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
    "table structure",
    "table former",
]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
]
# Only the top-level `docling` package directory is included in the build.
packages = [{ include = "docling" }]

# Runtime dependencies. Entries marked `optional = true` are installed only
# through the extras that list them in [tool.poetry.extras].
[tool.poetry.dependencies]
######################
# actual dependencies:
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = { extras = ["chunking"], version = "^2.19.0" }
docling-ibm-models = "^3.3.0"
deepsearch-glm = "^1.0.0"
docling-parse = "^3.3.0"
filetype = "^1.2.0"
pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.2"
easyocr = "^1.7"
tesserocr = { version = "^2.7.1", optional = true }
certifi = ">=2024.7.4"
rtree = "^1.3.0"
# Python < 3.10 is capped below SciPy 1.14.0; newer interpreters take any 1.x.
# NOTE(review): presumably SciPy 1.14 dropped Python 3.9 support — confirm.
scipy = [
    { version = "^1.6.0", markers = "python_version >= '3.10'" },
    { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
]
typer = "^0.12.5"
python-docx = "^1.1.2"
python-pptx = "^1.0.2"
# Lower bound stays at 4.12.3 so the older release remains installable.
beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
# Only resolved on macOS (sys_platform == 'darwin').
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
# Only resolved for Python < 3.13.
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
onnxruntime = [
    # 1.19.2 is the last version with python3.9 support,
    # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
    { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
    { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
]
# Intel macOS (darwin + x86_64) is held on the 4.42.x series; every other
# platform resolves ^4.46.0.
# NOTE(review): presumably tied to the torch ~2.2.2 pin for Intel macOS in
# the mac_intel group — confirm.
transformers = [
    { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
    { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"

# Development tooling: formatters, linters, type stubs, test runner and
# notebook support.
[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "^24.4.2" }
pytest = "^7.2.2"
pre-commit = "^3.7.1"
mypy = "^1.10.1"
isort = "^5.10.1"
python-semantic-release = "^7.32.2"
flake8 = "^6.0.0"
# NOTE(review): pyproject-flake8 and flake8-pyproject (below) both appear to
# let flake8 read its settings from pyproject.toml — confirm both are needed.
pyproject-flake8 = "^6.0.0"
pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"

# Documentation toolchain (MkDocs plus plugins).
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
griffe-pydantic = "^1.1.0"

# Packages in the `examples` dependency group.
# NOTE(review): presumably required only by the example scripts/notebooks —
# confirm.
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
python-dotenv = "^1.0.1"
langchain-huggingface = "^0.0.3"
langchain-milvus = "^0.1.4"
langchain-text-splitters = "^0.2.4"

# The `constraints` group is optional: Poetry installs it only when the
# group is requested explicitly.
[tool.poetry.group.constraints]
optional = true

# Version bounds for numpy, split by interpreter version: Python < 3.10 is
# capped below numpy 2.1.0, newer interpreters below 3.0.0.
[tool.poetry.group.constraints.dependencies]
numpy = [
    { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
    { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
]

# The `mac_intel` group is optional: Poetry installs it only when the group
# is requested explicitly.
[tool.poetry.group.mac_intel]
optional = true

# torch / torchvision bounds split by platform: Intel macOS
# (darwin + x86_64) is held on torch 2.2.x and torchvision 0.17.x, while
# every other platform resolves the wider caret ranges.
# NOTE(review): presumably these are the last builds published for Intel
# macOS — confirm.
[tool.poetry.group.mac_intel.dependencies]
torch = [
    { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
    { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
]
torchvision = [
    { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
    { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
]

# Installable extras. Each extra name maps to the optional packages it
# enables (e.g. `pip install "docling[vlm]"`).
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

# Console entry points, in `command = "module:object"` form.
[tool.poetry.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"

# PEP 517 build configuration: the package is built with poetry-core.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# Black formatter settings.
[tool.black]
line-length = 88
target-version = ["py39"]
# Regex of files to format: matches names ending in `.py` or `.pyi`.
include = '\.pyi?$'

# isort settings; the `black` profile and matching line length keep import
# sorting in step with the Black configuration.
[tool.isort]
profile = "black"
line_length = 88
py_version = 39

# mypy type-checker settings.
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
# NOTE(review): the package declares python = "^3.9" and black/isort target
# py39, but mypy checks against 3.10 — confirm this is intentional.
python_version = "3.10"

# Modules for which mypy does not report missing imports.
# NOTE(review): presumably these packages ship no type information — confirm.
[[tool.mypy.overrides]]
module = [
    "docling_parse.*",
    "pypdfium2.*",
    "networkx.*",
    "scipy.*",
    "filetype.*",
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "deepsearch_glm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
]
ignore_missing_imports = true

# flake8 settings.
[tool.flake8]
max-line-length = 88
# E203 (whitespace before ':') and E501 (line too long) are ignored.
# NOTE(review): presumably because they conflict with Black's output — confirm.
extend-ignore = ["E203", "E501"]

# python-semantic-release settings.
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

version_source = "tag_only"
branch = "main"

# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"