docs: introduce docs site (#141)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Panos Vagenas 2024-10-14 14:13:13 +02:00 committed by GitHub
parent 2b1e72d327
commit d504432c1e
25 changed files with 1324 additions and 574 deletions

README.md

@ -1,6 +1,6 @@
<p align="center">
<a href="https://github.com/ds4sd/docling">
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
</a>
</p>
@ -200,8 +200,8 @@ To see all available options (export formats etc.) run `docling --help`.
### RAG
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb)
- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb)
## Advanced features

Binary image file (258 KiB before and after)

Binary image file (18 KiB before and after)

105
docs/examples/batch_convert.py Normal file

@ -0,0 +1,105 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
partial_success_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {conv_res.input.file} was partially converted with the following errors:"
)
for item in conv_res.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
)
return success_count, partial_success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110.pdf"),
Path("./tests/data/redp5695.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

175
docs/examples/custom_convert.py Normal file

@ -0,0 +1,175 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
return success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
###########################################################################
# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.
# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = False
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
###########################################################################
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

85
docs/examples/export_figures.py Normal file

@ -0,0 +1,85 @@
import logging
import time
from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
TableElement,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
start_time = time.time()
conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
# Export page images
for page in conv_res.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
# Export figures and tables
for element, image in conv_res.render_element_images(
element_types=(FigureElement, TableElement)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
success_count += 1
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

116
docs/examples/export_multimodal.py Normal file

@ -0,0 +1,116 @@
import datetime
import logging
import time
from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
start_time = time.time()
converted_docs = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue
rows = []
for (
content_text,
content_md,
content_dt,
page_cells,
page_segments,
page,
) in generate_multimodal_pages(doc):
dpi = page._default_image_scale * 72
rows.append(
{
"document": doc.input.file.name,
"hash": doc.input.document_hash,
"page_hash": page.page_hash,
"image": {
"width": page.image.width,
"height": page.image.height,
"bytes": page.image.tobytes(),
},
"cells": page_cells,
"contents": content_text,
"contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments,
"extra": {
"page_num": page.page_no + 1,
"width_in_points": page.size.width,
"height_in_points": page.size.height,
"dpi": dpi,
},
}
)
success_count += 1
# Generate one parquet from all documents
df = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
# This block demonstrates how the file can be opened with the HF datasets library
# from datasets import Dataset
# from PIL import Image
# multimodal_df = pd.read_parquet(output_filename)
# # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
# dataset = Dataset.from_pandas(multimodal_df)
# def transforms(examples):
# examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
# return examples
# dataset = dataset.map(transforms)
if __name__ == "__main__":
main()

74
docs/examples/export_tables.py Normal file

@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
doc_converter = DocumentConverter()
start_time = time.time()
conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.output.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = (
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
)
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())
success_count += 1
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

6
docs/examples/minimal.py Normal file

@ -0,0 +1,6 @@
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown())  # output: "## Docling Technical Report [...]"

docs/examples/rag_langchain.ipynb

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦜🔗 LangChain"
"# RAG with LangChain 🦜🔗"
]
},
{

docs/examples/rag_llamaindex.ipynb

@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦙 LlamaIndex"
"# RAG with LlamaIndex 🦙"
]
},
{
@ -25,9 +25,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"LlamaIndex extensions `DoclingReader` and `DoclingNodeParser` presented in this notebook seamlessly integrate Docling into LlamaIndex, enabling you to:\n",
"This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n",
"\n",
"Presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n",
"- use PDF documents in your LLM applications with ease and speed, and\n",
"- leverage Docling's rich format for advanced, document-native grounding."
"- harness Docling's rich format for advanced, document-native grounding."
]
},
{

29
docs/index.md Normal file

@ -0,0 +1,29 @@
# Docling
<p align="center">
<a href="https://ds4sd.github.io/docling/">
<img loading="lazy" alt="Docling" src="assets/logo.png" width="150" />
</a>
</p>
[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Includes OCR support for scanned PDFs
* 🤖 Integrates easily with LLM app / RAG frameworks like LlamaIndex&nbsp;🦙 & LangChain&nbsp;🦜🔗
* 💻 Provides a simple and convenient CLI
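A minimal usage example, mirroring the new `docs/examples/minimal.py` added in this commit, looks as follows:

```python
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL

converter = DocumentConverter()
doc = converter.convert_single(source)

# Print the converted document as Markdown,
# e.g. "## Docling Technical Report [...]"
print(doc.render_as_markdown())
```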

100
docs/installation.md Normal file

@ -0,0 +1,100 @@
To use Docling, simply install `docling` from your Python package manager, e.g. pip:
```bash
pip install docling
```
Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
??? "Alternative PyTorch distributions"
The Docling models depend on the [PyTorch](https://pytorch.org/) library.
Depending on your architecture, you might want to use a different distribution of `torch`.
For example, you might want support for different accelerator or for a cpu-only version.
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
One common situation is the installation on Linux systems with cpu-only support.
In this case, we suggest the installation of Docling with the following options
```bash
# Example for installing on the Linux cpu-only version
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
```
??? "Alternative OCR engines"
Docling supports multiple OCR engines for processing scanned documents. The current version provides
the following engines.
| Engine | Installation | Usage |
| ------ | ------------ | ----- |
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:
```python
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
)
```
<h3>Tesseract installation</h3>
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine available on most operating systems. To use it with Docling, Tesseract must be installed on your system using the packaging tool of your choice. Example commands are provided below.
After installing Tesseract you are expected to provide the path to its language files using the
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
=== "macOS (via [Homebrew](https://brew.sh/))"
```console
brew install tesseract leptonica pkg-config
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
=== "Debian-based"
```console
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
=== "RHEL"
```console
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
<h3>Linking to Tesseract</h3>
The most efficient way to use the Tesseract library is via linking. Docling uses the
[Tesserocr](https://github.com/sirfz/tesserocr) package for this.
If you run into installation issues with Tesserocr, we suggest using the following
installation options:
```console
pip uninstall tesserocr
pip install --no-binary :all: tesserocr
```
## Development setup
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
```bash
poetry install --all-extras
```

25
docs/integrations/llamaindex.md Normal file

@ -0,0 +1,25 @@
## Get started
Docling is available as an official LlamaIndex extension!
To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.
## Components
### Docling Reader
Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown).
- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling)<!--{target="_blank"}-->
### Docling Node Parser
Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding.
- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
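As an illustrative sketch (not part of this commit), the two components are typically combined along the following lines; the import paths, the `export_type` parameter, and the `load_data` call reflect the published extension packages and should be treated as assumptions here:

```python
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# Read a document with Docling; the JSON export type (assumed name: ExportType.JSON)
# keeps the rich Docling format that DoclingNodeParser knows how to parse.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
documents = reader.load_data("https://arxiv.org/pdf/2408.09869")

# Parse the Docling-format documents into LlamaIndex nodes,
# e.g. to use as chunks for embedding downstream.
node_parser = DoclingNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)
print(f"Parsed {len(nodes)} nodes from {len(documents)} document(s)")
```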

7
docs/overrides/main.html Normal file

@ -0,0 +1,7 @@
{% extends "base.html" %}
{#
{% block announce %}
<p>🎉 Docling is now officially supported in LlamaIndex! <a href="{{ 'integrations/llamaindex/' | url }}">Check it out</a>!</p>
{% endblock %}
#}

3
docs/stylesheets/extra.css Normal file

@ -0,0 +1,3 @@
[data-md-color-scheme="default"] .md-banner a {
color: #5e8bde;
}

examples/batch_convert.py (file deleted)

@ -1,105 +0,0 @@
(105 lines removed; identical to docs/examples/batch_convert.py added above, now replaced by the symbolic link below)

1
examples/batch_convert.py Symbolic link

@ -0,0 +1 @@
../docs/examples/batch_convert.py

examples/custom_convert.py (file deleted)

@ -1,175 +0,0 @@
(175 lines removed; identical to docs/examples/custom_convert.py added above, now replaced by the symbolic link below)

1
examples/custom_convert.py Symbolic link

@ -0,0 +1 @@
../docs/examples/custom_convert.py

examples/export_figures.py (file deleted)

@ -1,85 +0,0 @@
(85 lines removed; identical to docs/examples/export_figures.py added above, now replaced by the symbolic link below)

1
examples/export_figures.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_figures.py

examples/export_multimodal.py (file deleted)

@ -1,116 +0,0 @@
(116 lines removed; identical to docs/examples/export_multimodal.py added above, now replaced by the symbolic link below)

1
examples/export_multimodal.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_multimodal.py

examples/export_tables.py (file deleted)

@ -1,74 +0,0 @@
(74 lines removed; identical to docs/examples/export_tables.py added above, now replaced by the symbolic link below)

1
examples/export_tables.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_tables.py

examples/minimal.py (file deleted)

@ -1,6 +0,0 @@
(6 lines removed; identical to docs/examples/minimal.py added above, now replaced by the symbolic link below)

1
examples/minimal.py Symbolic link

@ -0,0 +1 @@
../docs/examples/minimal.py

97
mkdocs.yml Normal file

@ -0,0 +1,97 @@
site_name: Docling
site_url: https://ds4sd.github.io/docling/
repo_name: DS4SD/docling
repo_url: https://github.com/DS4SD/docling
theme:
name: material
custom_dir: docs/overrides
palette:
# Palette toggle for automatic mode
- media: "(prefers-color-scheme)"
scheme: default
primary: black
toggle:
icon: material/brightness-auto
name: Switch to light mode
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: black
toggle:
icon: material/brightness-7
name: Switch to dark mode
# Palette toggle for dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
toggle:
icon: material/brightness-4
name: Switch to system preference
logo: assets/logo.png
favicon: assets/logo.png
features:
- content.tabs.link
- content.code.annotate
- content.code.copy
- announce.dismiss
- navigation.tabs
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.instant
- navigation.instant.prefetch
# - navigation.instant.preview
- navigation.instant.progress
- navigation.path
- navigation.sections # <=
- navigation.top
- navigation.tracking
- search.suggest
- toc.follow
nav:
- Get started:
- Home: index.md
- Installation: installation.md
# - Docling v2: v2.md
# - Concepts:
# - Docling Document: concepts/document.md
# - Chunking: concepts/chunking.md
- Examples:
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
- "Batch conversion": examples/batch_convert.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
# - Chunking:
# - Chunking: examples/chunking.md
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md
# - API reference:
# - API reference: api_reference/index.md
markdown_extensions:
- pymdownx.superfences
- pymdownx.tabbed:
alternate_style: true
slugify: !!python/object/apply:pymdownx.slugs.slugify
kwds:
case: lower
- admonition
- pymdownx.details
- attr_list
plugins:
- search
- mkdocs-jupyter
extra_css:
- stylesheets/extra.css

491
poetry.lock generated

@ -263,6 +263,20 @@ files = [
pycodestyle = ">=2.11.0"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[[package]]
name = "babel"
version = "2.16.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
files = [
{file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"},
{file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"},
]
[package.extras]
dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"]
[[package]]
name = "backports-tarfile"
version = "1.2.0"
@ -347,6 +361,24 @@ d = ["aiohttp (>=3.10)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "bleach"
version = "6.1.0"
description = "An easy safelist-based HTML-sanitizing tool."
optional = false
python-versions = ">=3.8"
files = [
{file = "bleach-6.1.0-py3-none-any.whl", hash = "sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6"},
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
]
[package.dependencies]
six = ">=1.9.0"
webencodings = "*"
[package.extras]
css = ["tinycss2 (>=1.1.0,<1.3)"]
[[package]]
name = "certifi"
version = "2024.8.30"
@ -931,6 +963,17 @@ tqdm = ">=4.64.0,<5.0.0"
[package.extras]
toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[[package]]
name = "defusedxml"
version = "0.7.1"
description = "XML bomb protection for Python stdlib modules"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"},
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
[[package]]
name = "deprecated"
version = "1.2.14"
@ -1185,6 +1228,20 @@ files = [
[package.extras]
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
[[package]]
name = "fastjsonschema"
version = "2.20.0"
description = "Fastest Python implementation of JSON schema"
optional = false
python-versions = "*"
files = [
{file = "fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a"},
{file = "fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23"},
]
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "filelock"
version = "3.16.1"
@ -1444,6 +1501,23 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
tqdm = ["tqdm"]
[[package]]
name = "ghp-import"
version = "2.1.0"
description = "Copy your docs directly to the gh-pages branch."
optional = false
python-versions = "*"
files = [
{file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"},
{file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"},
]
[package.dependencies]
python-dateutil = ">=2.8.1"
[package.extras]
dev = ["flake8", "markdown", "twine", "wheel"]
[[package]]
name = "gitdb"
version = "4.0.11"
@ -2214,6 +2288,17 @@ traitlets = ">=5.3"
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"]
test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"]
[[package]]
name = "jupyterlab-pygments"
version = "0.3.0"
description = "Pygments theme using JupyterLab CSS variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"},
{file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"},
]
[[package]]
name = "jupyterlab-widgets"
version = "3.0.13"
@ -2225,6 +2310,35 @@ files = [
{file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
]
[[package]]
name = "jupytext"
version = "1.16.4"
description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupytext-1.16.4-py3-none-any.whl", hash = "sha256:76989d2690e65667ea6fb411d8056abe7cd0437c07bd774660b83d62acf9490a"},
{file = "jupytext-1.16.4.tar.gz", hash = "sha256:28e33f46f2ce7a41fb9d677a4a2c95327285579b64ca104437c4b9eb1e4174e9"},
]
[package.dependencies]
markdown-it-py = ">=1.0"
mdit-py-plugins = "*"
nbformat = "*"
packaging = "*"
pyyaml = "*"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
docs = ["myst-parser", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"]
test = ["pytest", "pytest-randomly", "pytest-xdist"]
test-cov = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist"]
test-external = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
test-functional = ["pytest", "pytest-randomly", "pytest-xdist"]
test-integration = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-randomly", "pytest-xdist"]
test-ui = ["calysto-bash"]
[[package]]
name = "keyring"
version = "25.4.1"
@ -2777,6 +2891,21 @@ html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (==0.29.37)"]
[[package]]
name = "markdown"
version = "3.7"
description = "Python implementation of John Gruber's Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"},
{file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"},
]
[package.extras]
docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
testing = ["coverage", "pyyaml"]
[[package]]
name = "markdown-it-py"
version = "3.0.0"
@ -3008,6 +3137,25 @@ files = [
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]
[[package]]
name = "mdit-py-plugins"
version = "0.4.2"
description = "Collection of plugins for markdown-it-py"
optional = false
python-versions = ">=3.8"
files = [
{file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"},
{file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"},
]
[package.dependencies]
markdown-it-py = ">=1.0.0,<4.0.0"
[package.extras]
code-style = ["pre-commit"]
rtd = ["myst-parser", "sphinx-book-theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "mdurl"
version = "0.1.2"
@ -3034,6 +3182,17 @@ files = [
numpy = "*"
pandas = "*"
[[package]]
name = "mergedeep"
version = "1.3.4"
description = "A deep merge function for 🐍."
optional = false
python-versions = ">=3.6"
files = [
{file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"},
{file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"},
]
[[package]]
name = "milvus-lite"
version = "2.4.10"
@ -3067,6 +3226,122 @@ files = [
{file = "minijinja-2.2.0.tar.gz", hash = "sha256:4411052c7a60f8d56468cc6d17d45d72be3d5e89e9578a04f8336cc56601523c"},
]
[[package]]
name = "mistune"
version = "3.0.2"
description = "A sane and fast Markdown parser with useful plugins and renderers"
optional = false
python-versions = ">=3.7"
files = [
{file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"},
{file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"},
]
[[package]]
name = "mkdocs"
version = "1.6.1"
description = "Project documentation with Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"},
{file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"},
]
[package.dependencies]
click = ">=7.0"
colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""}
ghp-import = ">=1.0"
jinja2 = ">=2.11.1"
markdown = ">=3.3.6"
markupsafe = ">=2.0.1"
mergedeep = ">=1.3.4"
mkdocs-get-deps = ">=0.2.0"
packaging = ">=20.5"
pathspec = ">=0.11.1"
pyyaml = ">=5.1"
pyyaml-env-tag = ">=0.1"
watchdog = ">=2.0"
[package.extras]
i18n = ["babel (>=2.9.0)"]
min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
[[package]]
name = "mkdocs-get-deps"
version = "0.2.0"
description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"},
{file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"},
]
[package.dependencies]
mergedeep = ">=1.3.4"
platformdirs = ">=2.2.0"
pyyaml = ">=5.1"
[[package]]
name = "mkdocs-jupyter"
version = "0.25.0"
description = "Use Jupyter in mkdocs websites"
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocs_jupyter-0.25.0-py3-none-any.whl", hash = "sha256:d83d71deef19f0401505945bf92ec3bd5b40615af89308e72d5112929f8ee00b"},
{file = "mkdocs_jupyter-0.25.0.tar.gz", hash = "sha256:e26c1d341916bc57f96ea3f93d8d0a88fc77c87d4cee222f66d2007798d924f5"},
]
[package.dependencies]
ipykernel = ">6.0.0,<7.0.0"
jupytext = ">1.13.8,<2"
mkdocs = ">=1.4.0,<2"
mkdocs-material = ">9.0.0"
nbconvert = ">=7.2.9,<8"
pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
version = "9.5.40"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.5.40-py3-none-any.whl", hash = "sha256:8e7a16ada34e79a7b6459ff2602584222f522c738b6a023d1bea853d5049da6f"},
{file = "mkdocs_material-9.5.40.tar.gz", hash = "sha256:b69d70e667ec51fc41f65e006a3184dd00d95b2439d982cb1586e4c018943156"},
]
[package.dependencies]
babel = ">=2.10,<3.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
mkdocs = ">=1.6,<2.0"
mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"
[package.extras]
git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"]
imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"]
recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"]
[[package]]
name = "mkdocs-material-extensions"
version = "1.3.1"
description = "Extension pack for Python Markdown and MkDocs Material."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"},
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
]
[[package]]
name = "more-itertools"
version = "10.5.0"
@ -3281,6 +3556,86 @@ files = [
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]]
name = "nbclient"
version = "0.10.0"
description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor."
optional = false
python-versions = ">=3.8.0"
files = [
{file = "nbclient-0.10.0-py3-none-any.whl", hash = "sha256:f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f"},
{file = "nbclient-0.10.0.tar.gz", hash = "sha256:4b3f1b7dba531e498449c4db4f53da339c91d449dc11e9af3a43b4eb5c5abb09"},
]
[package.dependencies]
jupyter-client = ">=6.1.12"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
nbformat = ">=5.1"
traitlets = ">=5.4"
[package.extras]
dev = ["pre-commit"]
docs = ["autodoc-traits", "mock", "moto", "myst-parser", "nbclient[test]", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling"]
test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"]
[[package]]
name = "nbconvert"
version = "7.16.4"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.4-py3-none-any.whl", hash = "sha256:05873c620fe520b6322bf8a5ad562692343fe3452abda5765c7a34b7d1aa3eb3"},
{file = "nbconvert-7.16.4.tar.gz", hash = "sha256:86ca91ba266b0a448dc96fa6c5b9d98affabde2867b363258703536807f9f7f4"},
]
[package.dependencies]
beautifulsoup4 = "*"
bleach = "!=5.0.0"
defusedxml = "*"
jinja2 = ">=3.0"
jupyter-core = ">=4.7"
jupyterlab-pygments = "*"
markupsafe = ">=2.0"
mistune = ">=2.0.3,<4"
nbclient = ">=0.5.0"
nbformat = ">=5.7"
packaging = "*"
pandocfilters = ">=1.4.1"
pygments = ">=2.4.1"
tinycss2 = "*"
traitlets = ">=5.1"
[package.extras]
all = ["flaky", "ipykernel", "ipython", "ipywidgets (>=7.5)", "myst-parser", "nbsphinx (>=0.2.12)", "playwright", "pydata-sphinx-theme", "pyqtwebengine (>=5.15)", "pytest (>=7)", "sphinx (==5.0.2)", "sphinxcontrib-spelling", "tornado (>=6.1)"]
docs = ["ipykernel", "ipython", "myst-parser", "nbsphinx (>=0.2.12)", "pydata-sphinx-theme", "sphinx (==5.0.2)", "sphinxcontrib-spelling"]
qtpdf = ["pyqtwebengine (>=5.15)"]
qtpng = ["pyqtwebengine (>=5.15)"]
serve = ["tornado (>=6.1)"]
test = ["flaky", "ipykernel", "ipywidgets (>=7.5)", "pytest (>=7)"]
webpdf = ["playwright"]
[[package]]
name = "nbformat"
version = "5.10.4"
description = "The Jupyter Notebook format"
optional = false
python-versions = ">=3.8"
files = [
{file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"},
{file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"},
]
[package.dependencies]
fastjsonschema = ">=2.15"
jsonschema = ">=2.6"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
traitlets = ">=5.1"
[package.extras]
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"]
test = ["pep440", "pre-commit", "pytest", "testpath"]
[[package]]
name = "nbqa"
version = "1.9.0"
@@ -3758,6 +4113,21 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]
[[package]]
name = "paginate"
version = "0.5.7"
description = "Divides large result sets into pages for easier browsing"
optional = false
python-versions = "*"
files = [
{file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"},
{file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"},
]
[package.extras]
dev = ["pytest", "tox"]
lint = ["black"]
[[package]]
name = "pandas"
version = "2.2.3"
@@ -3859,6 +4229,17 @@ files = [
numpy = ">=1.23.5"
types-pytz = ">=2022.1.1"
[[package]]
name = "pandocfilters"
version = "1.5.1"
description = "Utilities for writing pandoc filters in python"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"},
{file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"},
]
[[package]]
name = "parso"
version = "0.8.4"
@@ -4574,6 +4955,24 @@ tomlkit = ">=0.10.1"
spelling = ["pyenchant (>=3.2,<4.0)"]
testutils = ["gitpython (>3)"]
[[package]]
name = "pymdown-extensions"
version = "10.11.2"
description = "Extension pack for Python Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "pymdown_extensions-10.11.2-py3-none-any.whl", hash = "sha256:41cdde0a77290e480cf53892f5c5e50921a7ee3e5cd60ba91bf19837b33badcf"},
{file = "pymdown_extensions-10.11.2.tar.gz", hash = "sha256:bc8847ecc9e784a098efd35e20cba772bc5a1b529dfcef9dc1972db9021a1049"},
]
[package.dependencies]
markdown = ">=3.6"
pyyaml = "*"
[package.extras]
extra = ["pygments (>=2.12)"]
[[package]]
name = "pymilvus"
version = "2.4.7"
@@ -5027,6 +5426,20 @@ files = [
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
]
[[package]]
name = "pyyaml-env-tag"
version = "0.1"
description = "A custom YAML tag for referencing environment variables in YAML files. "
optional = false
python-versions = ">=3.6"
files = [
{file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"},
{file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"},
]
[package.dependencies]
pyyaml = "*"
[[package]]
name = "pyzmq"
version = "26.2.0"
@@ -6342,6 +6755,24 @@ requests = ">=2.26.0"
[package.extras]
blobfile = ["blobfile (>=2)"]
[[package]]
name = "tinycss2"
version = "1.3.0"
description = "A tiny CSS parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7"},
{file = "tinycss2-1.3.0.tar.gz", hash = "sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d"},
]
[package.dependencies]
webencodings = ">=0.4"
[package.extras]
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["pytest", "ruff"]
[[package]]
name = "tokenize-rt"
version = "6.0.0"
@@ -6843,11 +7274,6 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
]
[package.dependencies]
@@ -7084,6 +7510,48 @@ platformdirs = ">=3.9.1,<5"
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]]
name = "watchdog"
version = "5.0.3"
description = "Filesystem events monitoring"
optional = false
python-versions = ">=3.9"
files = [
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:85527b882f3facda0579bce9d743ff7f10c3e1e0db0a0d0e28170a7d0e5ce2ea"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:53adf73dcdc0ef04f7735066b4a57a4cd3e49ef135daae41d77395f0b5b692cb"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e25adddab85f674acac303cf1f5835951345a56c5f7f582987d266679979c75b"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f01f4a3565a387080dc49bdd1fefe4ecc77f894991b88ef927edbfa45eb10818"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91b522adc25614cdeaf91f7897800b82c13b4b8ac68a42ca959f992f6990c490"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d52db5beb5e476e6853da2e2d24dbbbed6797b449c8bf7ea118a4ee0d2c9040e"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:94d11b07c64f63f49876e0ab8042ae034674c8653bfcdaa8c4b32e71cfff87e8"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:349c9488e1d85d0a58e8cb14222d2c51cbc801ce11ac3936ab4c3af986536926"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:53a3f10b62c2d569e260f96e8d966463dec1a50fa4f1b22aec69e3f91025060e"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:950f531ec6e03696a2414b6308f5c6ff9dab7821a768c9d5788b1314e9a46ca7"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae6deb336cba5d71476caa029ceb6e88047fc1dc74b62b7c4012639c0b563906"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1021223c08ba8d2d38d71ec1704496471ffd7be42cfb26b87cd5059323a389a1"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:752fb40efc7cc8d88ebc332b8f4bcbe2b5cc7e881bccfeb8e25054c00c994ee3"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a2e8f3f955d68471fa37b0e3add18500790d129cc7efe89971b8a4cc6fdeb0b2"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b8ca4d854adcf480bdfd80f46fdd6fb49f91dd020ae11c89b3a79e19454ec627"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:90a67d7857adb1d985aca232cc9905dd5bc4803ed85cfcdcfcf707e52049eda7"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:720ef9d3a4f9ca575a780af283c8fd3a0674b307651c1976714745090da5a9e8"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:223160bb359281bb8e31c8f1068bf71a6b16a8ad3d9524ca6f523ac666bb6a1e"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:560135542c91eaa74247a2e8430cf83c4342b29e8ad4f520ae14f0c8a19cfb5b"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:dd021efa85970bd4824acacbb922066159d0f9e546389a4743d56919b6758b91"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_armv7l.whl", hash = "sha256:78864cc8f23dbee55be34cc1494632a7ba30263951b5b2e8fc8286b95845f82c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_i686.whl", hash = "sha256:1e9679245e3ea6498494b3028b90c7b25dbb2abe65c7d07423ecfc2d6218ff7c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64.whl", hash = "sha256:9413384f26b5d050b6978e6fcd0c1e7f0539be7a4f1a885061473c5deaa57221"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:294b7a598974b8e2c6123d19ef15de9abcd282b0fbbdbc4d23dfa812959a9e05"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_s390x.whl", hash = "sha256:26dd201857d702bdf9d78c273cafcab5871dd29343748524695cecffa44a8d97"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:0f9332243355643d567697c3e3fa07330a1d1abf981611654a1f2bf2175612b7"},
{file = "watchdog-5.0.3-py3-none-win32.whl", hash = "sha256:c66f80ee5b602a9c7ab66e3c9f36026590a0902db3aea414d59a2f55188c1f49"},
{file = "watchdog-5.0.3-py3-none-win_amd64.whl", hash = "sha256:f00b4cf737f568be9665563347a910f8bdc76f88c2970121c86243c8cfdf90e9"},
{file = "watchdog-5.0.3-py3-none-win_ia64.whl", hash = "sha256:49f4d36cb315c25ea0d946e018c01bb028048023b9e103d3d3943f58e109dd45"},
{file = "watchdog-5.0.3.tar.gz", hash = "sha256:108f42a7f0345042a854d4d0ad0834b741d421330d5f575b81cb27b883500176"},
]
[package.extras]
watchmedo = ["PyYAML (>=3.10)"]
[[package]]
name = "wcwidth"
version = "0.2.13"
@@ -7095,6 +7563,17 @@ files = [
{file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
]
[[package]]
name = "webencodings"
version = "0.5.1"
description = "Character encoding aliases for legacy web content"
optional = false
python-versions = "*"
files = [
{file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
{file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
]
[[package]]
name = "wheel"
version = "0.44.0"
@@ -7462,4 +7941,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "536b2f199fe70180aa31e55e7ad47a75a0b64cd20bbe96caec294037966c7b00"
content-hash = "cae7819c1a144a8aa2b700d0399d7e9d78b55b3c743cfb0b118f4bd0baa2d34e"

View File

@@ -72,6 +72,8 @@ pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
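The two dev dependencies added above (mkdocs-material and mkdocs-jupyter) are what the new docs site is built with. A minimal sketch of how the site could be built programmatically is shown below; it is not part of this commit, and it assumes an mkdocs.yml exists at the repository root. The helper name build_docs and the default site_dir are illustrative; the calls to mkdocs.config.load_config and mkdocs.commands.build.build are the standard MkDocs build entry points, and the same result can be obtained from the CLI with `mkdocs build`.

# Minimal sketch (assumption, not part of this change): build the MkDocs site from Python,
# given an mkdocs.yml at the repository root that configures the material theme and
# the mkdocs-jupyter plugin.
from pathlib import Path

from mkdocs.commands import build
from mkdocs.config import load_config


def build_docs(config_path: str = "mkdocs.yml", site_dir: str = "site") -> Path:
    # Load the MkDocs configuration, overriding the output directory.
    config = load_config(config_path, site_dir=site_dir)
    # Render the static site into site_dir.
    build.build(config)
    return Path(site_dir)


if __name__ == "__main__":
    print(f"Docs built at: {build_docs()}")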