docs: introduce docs site (#141)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
parent 2b1e72d327
commit d504432c1e
@@ -1,6 +1,6 @@
 <p align="center">
   <a href="https://github.com/ds4sd/docling">
-    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
+    <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
   </a>
 </p>
@@ -200,8 +200,8 @@ To see all available options (export formats etc.) run `docling --help`.
 
 ### RAG
 
 Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
 
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb)
 
 ## Advanced features
[binary image diffs: two logo assets relocated; dimensions and sizes unchanged (258 KiB and 18 KiB)]
docs/examples/batch_convert.py (new file, 105 lines)
@@ -0,0 +1,105 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_text())

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_markdown())

            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_doctags())

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
            )
            for item in conv_res.errors:
                _log.info(f"\t{item.error_message}")
            partial_success_count += 1
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + partial_success_count + failure_count} docs, "
        f"of which {failure_count} failed "
        f"and {partial_success_count} were partially converted."
    )
    return success_count, partial_success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
        Path("./tests/data/2203.01017v2.pdf"),
        Path("./tests/data/2305.03393v1.pdf"),
        Path("./tests/data/redp5110.pdf"),
        Path("./tests/data/redp5695.pdf"),
    ]

    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

    doc_converter = DocumentConverter()

    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input)
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
docs/examples/custom_convert.py (new file, 175 lines)
@@ -0,0 +1,175 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem

            # Export Deep Search document JSON format:
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))

            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_text())

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
                fp.write(conv_res.render_as_markdown())

            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(conv_res.render_as_doctags())

        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )

    return success_count, failure_count


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    # Docling Parse with EasyOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     pipeline_options=pipeline_options,
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

    ###########################################################################

    # Define input files
    input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(input)
    success_count, failure_count = export_documents(
        conv_results, output_dir=Path("./scratch")
    )

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
docs/examples/export_figures.py (new file, 85 lines)
@@ -0,0 +1,85 @@
import logging
import time
from pathlib import Path
from typing import Tuple

from docling.datamodel.base_models import (
    AssembleOptions,
    ConversionStatus,
    FigureElement,
    PageElement,
    TableElement,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    # Important: for operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory.
    # This is done by setting AssembleOptions.images_scale, which also defines
    # the scale of the images: scale=1 corresponds to a standard 72 DPI image.
    assemble_options = AssembleOptions()
    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(assemble_options=assemble_options)

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    success_count = 0
    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export page images
        for page in conv_res.pages:
            page_no = page.page_no + 1
            page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
            with page_image_filename.open("wb") as fp:
                page.image.save(fp, format="PNG")

        # Export figures and tables
        for element, image in conv_res.render_element_images(
            element_types=(FigureElement, TableElement)
        ):
            element_image_filename = (
                output_dir / f"{doc_filename}-element-{element.id}.png"
            )
            with element_image_filename.open("wb") as fp:
                image.save(fp, "PNG")

        success_count += 1

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
docs/examples/export_multimodal.py (new file, 116 lines)
@@ -0,0 +1,116 @@
import datetime
import logging
import time
from pathlib import Path

import pandas as pd

from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.utils.export import generate_multimodal_pages

_log = logging.getLogger(__name__)

IMAGE_RESOLUTION_SCALE = 2.0


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    # Important: for operating with page images, we must keep them, otherwise
    # the DocumentConverter will discard them to free memory.
    # This is done by setting AssembleOptions.images_scale, which also defines
    # the scale of the images: scale=1 corresponds to a standard 72 DPI image.
    assemble_options = AssembleOptions()
    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

    doc_converter = DocumentConverter(assemble_options=assemble_options)

    start_time = time.time()

    converted_docs = doc_converter.convert(input_files)

    success_count = 0
    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for doc in converted_docs:
        if doc.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {doc.input.file} failed to convert.")
            failure_count += 1
            continue

        rows = []
        for (
            content_text,
            content_md,
            content_dt,
            page_cells,
            page_segments,
            page,
        ) in generate_multimodal_pages(doc):

            dpi = page._default_image_scale * 72

            rows.append(
                {
                    "document": doc.input.file.name,
                    "hash": doc.input.document_hash,
                    "page_hash": page.page_hash,
                    "image": {
                        "width": page.image.width,
                        "height": page.image.height,
                        "bytes": page.image.tobytes(),
                    },
                    "cells": page_cells,
                    "contents": content_text,
                    "contents_md": content_md,
                    "contents_dt": content_dt,
                    "segments": page_segments,
                    "extra": {
                        "page_num": page.page_no + 1,
                        "width_in_points": page.size.width,
                        "height_in_points": page.size.height,
                        "dpi": dpi,
                    },
                }
            )
        success_count += 1

    # Generate one parquet file from all documents
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )

    # This block demonstrates how the file can be opened with the HF datasets library
    # from datasets import Dataset
    # from PIL import Image
    # multimodal_df = pd.read_parquet(output_filename)

    # # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
    # dataset = Dataset.from_pandas(multimodal_df)
    # def transforms(examples):
    #     examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
    #     return examples
    # dataset = dataset.map(transforms)


if __name__ == "__main__":
    main()
docs/examples/export_tables.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple

import pandas as pd

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)


def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    success_count = 0
    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export tables
        for table_ix, table in enumerate(conv_res.output.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as CSV
            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {element_csv_filename}")
            table_df.to_csv(element_csv_filename)

            # Save the table as HTML
            element_html_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            )
            _log.info(f"Saving HTML table to {element_html_filename}")
            with element_html_filename.open("w") as fp:
                fp.write(table.export_to_html())

        success_count += 1

    end_time = time.time() - start_time

    _log.info(f"All documents were converted in {end_time:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed to convert {failure_count} of {len(input_doc_paths)} documents."
        )


if __name__ == "__main__":
    main()
docs/examples/minimal.py (new file, 6 lines)
@@ -0,0 +1,6 @@
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown())  # output: "## Docling Technical Report [...]"
docs/examples/rag_langchain.ipynb:
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-    "# RAG with Docling and 🦜🔗 LangChain"
+    "# RAG with LangChain 🦜🔗"
 ]
 },
 {
docs/examples/rag_llamaindex.ipynb:
@@ -11,7 +11,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-    "# RAG with Docling and 🦙 LlamaIndex"
+    "# RAG with LlamaIndex 🦙"
 ]
 },
 {
@@ -25,9 +25,11 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-    "LlamaIndex extensions `DoclingReader` and `DoclingNodeParser` presented in this notebook seamlessly integrate Docling into LlamaIndex, enabling you to:\n",
+    "This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n",
+    "\n",
+    "Presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n",
 "- use PDF documents in your LLM applications with ease and speed, and\n",
-    "- leverage Docling's rich format for advanced, document-native grounding."
+    "- harness Docling's rich format for advanced, document-native grounding."
 ]
 },
 {
docs/index.md (new file, 29 lines)
@@ -0,0 +1,29 @@
# Docling

<p align="center">
  <a href="https://ds4sd.github.io/docling/">
    <img loading="lazy" alt="Docling" src="assets/logo.png" width="150" />
  </a>
</p>

[arXiv](https://arxiv.org/abs/2408.09869)
[PyPI](https://pypi.org/project/docling/)
[Poetry](https://python-poetry.org/)
[Code style: black](https://github.com/psf/black)
[Imports: isort](https://pycqa.github.io/isort/)
[Pydantic](https://pydantic.dev)
[pre-commit](https://github.com/pre-commit/pre-commit)
[License: MIT](https://opensource.org/licenses/MIT)

Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.

## Features

* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Includes OCR support for scanned PDFs
* 🤖 Integrates easily with LLM app / RAG frameworks like LlamaIndex 🦙 & LangChain 🦜🔗
* 💻 Provides a simple and convenient CLI
docs/installation.md (new file, 100 lines)
@@ -0,0 +1,100 @@
To use Docling, simply install `docling` from your Python package manager, e.g. pip:

```bash
pip install docling
```

Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
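As a quick smoke test of the installation, you can run the minimal example that this commit adds under `docs/examples/minimal.py` (shown in full above), here slightly condensed:

```python
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc = converter.convert_single("https://arxiv.org/pdf/2408.09869")  # PDF path or URL
print(doc.render_as_markdown())  # e.g. "## Docling Technical Report [...]"
```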
??? "Alternative PyTorch distributions"

    The Docling models depend on the [PyTorch](https://pytorch.org/) library.
    Depending on your architecture, you might want to use a different distribution of `torch`,
    for example to get support for a different accelerator or for a CPU-only version.
    All the ways of installing `torch` are listed on its website: <https://pytorch.org/>.

    One common case is installation on Linux systems with CPU-only support.
    In this case, we suggest installing Docling with the following options:

    ```bash
    # Example for installing the Linux CPU-only version
    pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
    ```

??? "Alternative OCR engines"

    Docling supports multiple OCR engines for processing scanned documents. The current version provides the following engines:

    | Engine | Installation | Usage |
    | ------ | ------------ | ----- |
    | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling, or via `pip install easyocr`. | `EasyOcrOptions` |
    | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
    | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |

    The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:

    ```python
    from docling.datamodel.base_models import PipelineOptions
    from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions
    from docling.document_converter import DocumentConverter

    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
    )
    ```

    <h3>Tesseract installation</h3>

    [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine available on most operating systems.
    To use this engine with Docling, Tesseract must be installed on your system using the packaging tool of your choice; example commands are given below.
    After installing Tesseract, provide the path to its language files via the `TESSDATA_PREFIX` environment variable (note that it must end with a slash `/`).

    === "macOS (via [Homebrew](https://brew.sh/))"

        ```console
        brew install tesseract leptonica pkg-config
        TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    === "Debian-based"

        ```console
        apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
        TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    === "RHEL"

        ```console
        dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
        TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
        echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
        ```

    <h3>Linking to Tesseract</h3>

    The most efficient way to use the Tesseract library is via linking; Docling uses the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.

    If you run into installation issues with Tesserocr, we suggest the following installation steps:

    ```console
    pip uninstall tesserocr
    pip install --no-binary :all: tesserocr
    ```

## Development setup

To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:

```bash
poetry install --all-extras
```
docs/integrations/llamaindex.md (new file, 25 lines)
@@ -0,0 +1,25 @@
## Get started

Docling is available as an official LlamaIndex extension!

To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.

## Components

### Docling Reader

Reads document files and uses Docling to populate LlamaIndex `Document` objects, either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown). A usage sketch follows the links below.

- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling)<!--{target="_blank"}-->
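A minimal usage sketch: the import path and the `file_path` argument below are assumptions derived from the `llama-index-readers-docling` package naming above, so consult the linked API docs for the authoritative interface:

```python
# Hypothetical sketch; import path assumed from the package name above.
from llama_index.readers.docling import DoclingReader

reader = DoclingReader()  # default export serializes Docling's data model
documents = reader.load_data(file_path="https://arxiv.org/pdf/2408.09869")
print(documents[0].text[:300])  # inspect the beginning of the first Document
```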
### Docling Node Parser

Reads LlamaIndex `Document` objects populated in Docling's format by the Docling Reader and, using its knowledge of the Docling format, parses them into LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding. A usage sketch follows the links below.

- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
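Chaining both components might look as follows; again, the import paths are assumptions based on the package names above, while `get_nodes_from_documents` is the generic LlamaIndex node-parser entry point:

```python
# Hypothetical sketch; import paths assumed from the package names above.
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# The reader's lossless (JSON) export preserves the Docling format
# that the node parser knows how to traverse.
reader = DoclingReader()
documents = reader.load_data(file_path="https://arxiv.org/pdf/2408.09869")

node_parser = DoclingNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)
print(f"Parsed {len(nodes)} nodes, e.g. for embedding as chunks")
```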
docs/overrides/main.html (new file, 7 lines)
@@ -0,0 +1,7 @@
{% extends "base.html" %}

{#
{% block announce %}
<p>🎉 Docling is now officially supported in LlamaIndex! <a href="{{ 'integrations/llamaindex/' | url }}">Check it out</a>!</p>
{% endblock %}
#}
docs/stylesheets/extra.css (new file, 3 lines)
@@ -0,0 +1,3 @@
[data-md-color-scheme="default"] .md-banner a {
  color: #5e8bde;
}
examples/batch_convert.py (converted to a symbolic link)
@@ -1,105 +0,0 @@
[105 lines removed; contents identical to docs/examples/batch_convert.py above]
@@ -0,0 +1 @@
../docs/examples/batch_convert.py
examples/custom_convert.py (converted to a symbolic link)
@@ -1,175 +0,0 @@
[175 lines removed; contents identical to docs/examples/custom_convert.py above]
@@ -0,0 +1 @@
../docs/examples/custom_convert.py
examples/export_figures.py (converted to a symbolic link)
@@ -1,85 +0,0 @@
[85 lines removed; contents identical to docs/examples/export_figures.py above]
@@ -0,0 +1 @@
../docs/examples/export_figures.py
examples/export_multimodal.py (converted to a symbolic link)
@@ -1,116 +0,0 @@
[116 lines removed; contents identical to docs/examples/export_multimodal.py above]
@@ -0,0 +1 @@
../docs/examples/export_multimodal.py
examples/export_tables.py (converted to a symbolic link)
@@ -1,74 +0,0 @@
[74 lines removed; contents identical to docs/examples/export_tables.py above]
@@ -0,0 +1 @@
../docs/examples/export_tables.py
examples/minimal.py (converted to a symbolic link)
@@ -1,6 +0,0 @@
[6 lines removed; contents identical to docs/examples/minimal.py above]
@@ -0,0 +1 @@
../docs/examples/minimal.py
mkdocs.yml (new file, 97 lines)
@@ -0,0 +1,97 @@
site_name: Docling
site_url: https://ds4sd.github.io/docling/
repo_name: DS4SD/docling
repo_url: https://github.com/DS4SD/docling

theme:
  name: material
  custom_dir: docs/overrides
  palette:
    # Palette toggle for automatic mode
    - media: "(prefers-color-scheme)"
      scheme: default
      primary: black
      toggle:
        icon: material/brightness-auto
        name: Switch to light mode

    # Palette toggle for light mode
    - media: "(prefers-color-scheme: light)"
      scheme: default
      primary: black
      toggle:
        icon: material/brightness-7
        name: Switch to dark mode

    # Palette toggle for dark mode
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      primary: black
      toggle:
        icon: material/brightness-4
        name: Switch to system preference

  logo: assets/logo.png
  favicon: assets/logo.png
  features:
    - content.tabs.link
    - content.code.annotate
    - content.code.copy
    - announce.dismiss
    - navigation.tabs
    # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
    - navigation.instant
    - navigation.instant.prefetch
    # - navigation.instant.preview
    - navigation.instant.progress
    - navigation.path
    - navigation.sections # <=
    - navigation.top
    - navigation.tracking
    - search.suggest
    - toc.follow
nav:
  - Get started:
    - Home: index.md
    - Installation: installation.md
    # - Docling v2: v2.md
  # - Concepts:
  #   - Docling Document: concepts/document.md
  #   - Chunking: concepts/chunking.md
  - Examples:
    - Conversion:
      - "Simple conversion": examples/minimal.py
      - "Custom conversion": examples/custom_convert.py
      - "Batch conversion": examples/batch_convert.py
      - "Figure export": examples/export_figures.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
    - RAG / QA:
      - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
      - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
    # - Chunking:
    #   - Chunking: examples/chunking.md
    # - CLI:
    #   - CLI: examples/cli.md
  - Integrations:
    - "LlamaIndex 🦙 extension": integrations/llamaindex.md
    # - "LangChain 🦜🔗 extension": integrations/langchain.md
  # - API reference:
  #   - API reference: api_reference/index.md

markdown_extensions:
  - pymdownx.superfences
  - pymdownx.tabbed:
      alternate_style: true
      slugify: !!python/object/apply:pymdownx.slugs.slugify
        kwds:
          case: lower
  - admonition
  - pymdownx.details
  - attr_list
plugins:
  - search
  - mkdocs-jupyter

extra_css:
  - stylesheets/extra.css
491
poetry.lock
generated
491
poetry.lock
generated
@ -263,6 +263,20 @@ files = [
|
||||
pycodestyle = ">=2.11.0"
|
||||
tomli = {version = "*", markers = "python_version < \"3.11\""}
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
version = "2.16.0"
|
||||
description = "Internationalization utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"},
|
||||
{file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"]
|
||||
|
||||
[[package]]
|
||||
name = "backports-tarfile"
|
||||
version = "1.2.0"
|
||||
@ -347,6 +361,24 @@ d = ["aiohttp (>=3.10)"]
|
||||
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
|
||||
uvloop = ["uvloop (>=0.15.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "bleach"
|
||||
version = "6.1.0"
|
||||
description = "An easy safelist-based HTML-sanitizing tool."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "bleach-6.1.0-py3-none-any.whl", hash = "sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6"},
|
||||
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
six = ">=1.9.0"
|
||||
webencodings = "*"
|
||||
|
||||
[package.extras]
|
||||
css = ["tinycss2 (>=1.1.0,<1.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2024.8.30"
|
||||
@ -931,6 +963,17 @@ tqdm = ">=4.64.0,<5.0.0"
|
||||
[package.extras]
|
||||
toolkit = ["deepsearch-toolkit (>=0.31.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "defusedxml"
|
||||
version = "0.7.1"
|
||||
description = "XML bomb protection for Python stdlib modules"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
files = [
|
||||
{file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"},
|
||||
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deprecated"
|
||||
version = "1.2.14"
|
||||
@ -1185,6 +1228,20 @@ files = [
|
||||
[package.extras]
|
||||
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
|
||||
|
||||
[[package]]
|
||||
name = "fastjsonschema"
|
||||
version = "2.20.0"
|
||||
description = "Fastest Python implementation of JSON schema"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a"},
|
||||
{file = "fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
|
||||
|
||||
[[package]]
|
||||
name = "filelock"
|
||||
version = "3.16.1"
|
||||
@ -1444,6 +1501,23 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
tqdm = ["tqdm"]

[[package]]
name = "ghp-import"
version = "2.1.0"
description = "Copy your docs directly to the gh-pages branch."
optional = false
python-versions = "*"
files = [
{file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"},
{file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"},
]

[package.dependencies]
python-dateutil = ">=2.8.1"

[package.extras]
dev = ["flake8", "markdown", "twine", "wheel"]

[[package]]
name = "gitdb"
version = "4.0.11"
@ -2214,6 +2288,17 @@ traitlets = ">=5.3"
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"]
test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"]

[[package]]
name = "jupyterlab-pygments"
version = "0.3.0"
description = "Pygments theme using JupyterLab CSS variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"},
{file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"},
]

[[package]]
name = "jupyterlab-widgets"
version = "3.0.13"
@ -2225,6 +2310,35 @@ files = [
{file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
]

[[package]]
name = "jupytext"
version = "1.16.4"
description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupytext-1.16.4-py3-none-any.whl", hash = "sha256:76989d2690e65667ea6fb411d8056abe7cd0437c07bd774660b83d62acf9490a"},
{file = "jupytext-1.16.4.tar.gz", hash = "sha256:28e33f46f2ce7a41fb9d677a4a2c95327285579b64ca104437c4b9eb1e4174e9"},
]

[package.dependencies]
markdown-it-py = ">=1.0"
mdit-py-plugins = "*"
nbformat = "*"
packaging = "*"
pyyaml = "*"
tomli = {version = "*", markers = "python_version < \"3.11\""}

[package.extras]
dev = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
docs = ["myst-parser", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"]
test = ["pytest", "pytest-randomly", "pytest-xdist"]
test-cov = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist"]
test-external = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
test-functional = ["pytest", "pytest-randomly", "pytest-xdist"]
test-integration = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-randomly", "pytest-xdist"]
test-ui = ["calysto-bash"]

[[package]]
name = "keyring"
version = "25.4.1"
@ -2777,6 +2891,21 @@ html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (==0.29.37)"]

[[package]]
name = "markdown"
version = "3.7"
description = "Python implementation of John Gruber's Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"},
{file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"},
]

[package.extras]
docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
testing = ["coverage", "pyyaml"]

[[package]]
name = "markdown-it-py"
version = "3.0.0"
@ -3008,6 +3137,25 @@ files = [
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]

[[package]]
name = "mdit-py-plugins"
version = "0.4.2"
description = "Collection of plugins for markdown-it-py"
optional = false
python-versions = ">=3.8"
files = [
{file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"},
{file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"},
]

[package.dependencies]
markdown-it-py = ">=1.0.0,<4.0.0"

[package.extras]
code-style = ["pre-commit"]
rtd = ["myst-parser", "sphinx-book-theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]

[[package]]
name = "mdurl"
version = "0.1.2"
@ -3034,6 +3182,17 @@ files = [
numpy = "*"
pandas = "*"

[[package]]
name = "mergedeep"
version = "1.3.4"
description = "A deep merge function for 🐍."
optional = false
python-versions = ">=3.6"
files = [
{file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"},
{file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"},
]

[[package]]
name = "milvus-lite"
version = "2.4.10"
@ -3067,6 +3226,122 @@ files = [
{file = "minijinja-2.2.0.tar.gz", hash = "sha256:4411052c7a60f8d56468cc6d17d45d72be3d5e89e9578a04f8336cc56601523c"},
]

[[package]]
name = "mistune"
version = "3.0.2"
description = "A sane and fast Markdown parser with useful plugins and renderers"
optional = false
python-versions = ">=3.7"
files = [
{file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"},
{file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"},
]

[[package]]
name = "mkdocs"
version = "1.6.1"
description = "Project documentation with Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"},
{file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"},
]

[package.dependencies]
click = ">=7.0"
colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""}
ghp-import = ">=1.0"
jinja2 = ">=2.11.1"
markdown = ">=3.3.6"
markupsafe = ">=2.0.1"
mergedeep = ">=1.3.4"
mkdocs-get-deps = ">=0.2.0"
packaging = ">=20.5"
pathspec = ">=0.11.1"
pyyaml = ">=5.1"
pyyaml-env-tag = ">=0.1"
watchdog = ">=2.0"

[package.extras]
i18n = ["babel (>=2.9.0)"]
min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]

[[package]]
name = "mkdocs-get-deps"
version = "0.2.0"
description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"},
{file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"},
]

[package.dependencies]
mergedeep = ">=1.3.4"
platformdirs = ">=2.2.0"
pyyaml = ">=5.1"

[[package]]
name = "mkdocs-jupyter"
version = "0.25.0"
description = "Use Jupyter in mkdocs websites"
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocs_jupyter-0.25.0-py3-none-any.whl", hash = "sha256:d83d71deef19f0401505945bf92ec3bd5b40615af89308e72d5112929f8ee00b"},
{file = "mkdocs_jupyter-0.25.0.tar.gz", hash = "sha256:e26c1d341916bc57f96ea3f93d8d0a88fc77c87d4cee222f66d2007798d924f5"},
]

[package.dependencies]
ipykernel = ">6.0.0,<7.0.0"
jupytext = ">1.13.8,<2"
mkdocs = ">=1.4.0,<2"
mkdocs-material = ">9.0.0"
nbconvert = ">=7.2.9,<8"
pygments = ">2.12.0"

[[package]]
name = "mkdocs-material"
version = "9.5.40"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.5.40-py3-none-any.whl", hash = "sha256:8e7a16ada34e79a7b6459ff2602584222f522c738b6a023d1bea853d5049da6f"},
{file = "mkdocs_material-9.5.40.tar.gz", hash = "sha256:b69d70e667ec51fc41f65e006a3184dd00d95b2439d982cb1586e4c018943156"},
]

[package.dependencies]
babel = ">=2.10,<3.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
mkdocs = ">=1.6,<2.0"
mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"

[package.extras]
git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"]
imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"]
recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"]

[[package]]
name = "mkdocs-material-extensions"
version = "1.3.1"
description = "Extension pack for Python Markdown and MkDocs Material."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"},
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
]

[[package]]
name = "more-itertools"
version = "10.5.0"
@ -3281,6 +3556,86 @@ files = [
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]

[[package]]
name = "nbclient"
version = "0.10.0"
description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor."
optional = false
python-versions = ">=3.8.0"
files = [
{file = "nbclient-0.10.0-py3-none-any.whl", hash = "sha256:f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f"},
{file = "nbclient-0.10.0.tar.gz", hash = "sha256:4b3f1b7dba531e498449c4db4f53da339c91d449dc11e9af3a43b4eb5c5abb09"},
]

[package.dependencies]
jupyter-client = ">=6.1.12"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
nbformat = ">=5.1"
traitlets = ">=5.4"

[package.extras]
dev = ["pre-commit"]
docs = ["autodoc-traits", "mock", "moto", "myst-parser", "nbclient[test]", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling"]
test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"]

[[package]]
name = "nbconvert"
version = "7.16.4"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.4-py3-none-any.whl", hash = "sha256:05873c620fe520b6322bf8a5ad562692343fe3452abda5765c7a34b7d1aa3eb3"},
{file = "nbconvert-7.16.4.tar.gz", hash = "sha256:86ca91ba266b0a448dc96fa6c5b9d98affabde2867b363258703536807f9f7f4"},
]

[package.dependencies]
beautifulsoup4 = "*"
bleach = "!=5.0.0"
defusedxml = "*"
jinja2 = ">=3.0"
jupyter-core = ">=4.7"
jupyterlab-pygments = "*"
markupsafe = ">=2.0"
mistune = ">=2.0.3,<4"
nbclient = ">=0.5.0"
nbformat = ">=5.7"
packaging = "*"
pandocfilters = ">=1.4.1"
pygments = ">=2.4.1"
tinycss2 = "*"
traitlets = ">=5.1"

[package.extras]
all = ["flaky", "ipykernel", "ipython", "ipywidgets (>=7.5)", "myst-parser", "nbsphinx (>=0.2.12)", "playwright", "pydata-sphinx-theme", "pyqtwebengine (>=5.15)", "pytest (>=7)", "sphinx (==5.0.2)", "sphinxcontrib-spelling", "tornado (>=6.1)"]
docs = ["ipykernel", "ipython", "myst-parser", "nbsphinx (>=0.2.12)", "pydata-sphinx-theme", "sphinx (==5.0.2)", "sphinxcontrib-spelling"]
qtpdf = ["pyqtwebengine (>=5.15)"]
qtpng = ["pyqtwebengine (>=5.15)"]
serve = ["tornado (>=6.1)"]
test = ["flaky", "ipykernel", "ipywidgets (>=7.5)", "pytest (>=7)"]
webpdf = ["playwright"]

[[package]]
name = "nbformat"
version = "5.10.4"
description = "The Jupyter Notebook format"
optional = false
python-versions = ">=3.8"
files = [
{file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"},
{file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"},
]

[package.dependencies]
fastjsonschema = ">=2.15"
jsonschema = ">=2.6"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
traitlets = ">=5.1"

[package.extras]
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"]
test = ["pep440", "pre-commit", "pytest", "testpath"]

[[package]]
name = "nbqa"
version = "1.9.0"
@ -3758,6 +4113,21 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]

[[package]]
name = "paginate"
version = "0.5.7"
description = "Divides large result sets into pages for easier browsing"
optional = false
python-versions = "*"
files = [
{file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"},
{file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"},
]

[package.extras]
dev = ["pytest", "tox"]
lint = ["black"]

[[package]]
name = "pandas"
version = "2.2.3"
@ -3859,6 +4229,17 @@ files = [
numpy = ">=1.23.5"
types-pytz = ">=2022.1.1"

[[package]]
name = "pandocfilters"
version = "1.5.1"
description = "Utilities for writing pandoc filters in python"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"},
{file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"},
]

[[package]]
name = "parso"
version = "0.8.4"
@ -4574,6 +4955,24 @@ tomlkit = ">=0.10.1"
spelling = ["pyenchant (>=3.2,<4.0)"]
testutils = ["gitpython (>3)"]

[[package]]
name = "pymdown-extensions"
version = "10.11.2"
description = "Extension pack for Python Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "pymdown_extensions-10.11.2-py3-none-any.whl", hash = "sha256:41cdde0a77290e480cf53892f5c5e50921a7ee3e5cd60ba91bf19837b33badcf"},
{file = "pymdown_extensions-10.11.2.tar.gz", hash = "sha256:bc8847ecc9e784a098efd35e20cba772bc5a1b529dfcef9dc1972db9021a1049"},
]

[package.dependencies]
markdown = ">=3.6"
pyyaml = "*"

[package.extras]
extra = ["pygments (>=2.12)"]

[[package]]
name = "pymilvus"
version = "2.4.7"
@ -5027,6 +5426,20 @@ files = [
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
]

[[package]]
name = "pyyaml-env-tag"
version = "0.1"
description = "A custom YAML tag for referencing environment variables in YAML files. "
optional = false
python-versions = ">=3.6"
files = [
{file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"},
{file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"},
]

[package.dependencies]
pyyaml = "*"

[[package]]
name = "pyzmq"
version = "26.2.0"
@ -6342,6 +6755,24 @@ requests = ">=2.26.0"
[package.extras]
blobfile = ["blobfile (>=2)"]

[[package]]
name = "tinycss2"
version = "1.3.0"
description = "A tiny CSS parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7"},
{file = "tinycss2-1.3.0.tar.gz", hash = "sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d"},
]

[package.dependencies]
webencodings = ">=0.4"

[package.extras]
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["pytest", "ruff"]

[[package]]
name = "tokenize-rt"
version = "6.0.0"
@ -6843,11 +7274,6 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
]

[package.dependencies]
@ -7084,6 +7510,48 @@ platformdirs = ">=3.9.1,<5"
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]

[[package]]
name = "watchdog"
version = "5.0.3"
description = "Filesystem events monitoring"
optional = false
python-versions = ">=3.9"
files = [
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:85527b882f3facda0579bce9d743ff7f10c3e1e0db0a0d0e28170a7d0e5ce2ea"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:53adf73dcdc0ef04f7735066b4a57a4cd3e49ef135daae41d77395f0b5b692cb"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e25adddab85f674acac303cf1f5835951345a56c5f7f582987d266679979c75b"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f01f4a3565a387080dc49bdd1fefe4ecc77f894991b88ef927edbfa45eb10818"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91b522adc25614cdeaf91f7897800b82c13b4b8ac68a42ca959f992f6990c490"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d52db5beb5e476e6853da2e2d24dbbbed6797b449c8bf7ea118a4ee0d2c9040e"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:94d11b07c64f63f49876e0ab8042ae034674c8653bfcdaa8c4b32e71cfff87e8"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:349c9488e1d85d0a58e8cb14222d2c51cbc801ce11ac3936ab4c3af986536926"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:53a3f10b62c2d569e260f96e8d966463dec1a50fa4f1b22aec69e3f91025060e"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:950f531ec6e03696a2414b6308f5c6ff9dab7821a768c9d5788b1314e9a46ca7"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae6deb336cba5d71476caa029ceb6e88047fc1dc74b62b7c4012639c0b563906"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1021223c08ba8d2d38d71ec1704496471ffd7be42cfb26b87cd5059323a389a1"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:752fb40efc7cc8d88ebc332b8f4bcbe2b5cc7e881bccfeb8e25054c00c994ee3"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a2e8f3f955d68471fa37b0e3add18500790d129cc7efe89971b8a4cc6fdeb0b2"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b8ca4d854adcf480bdfd80f46fdd6fb49f91dd020ae11c89b3a79e19454ec627"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:90a67d7857adb1d985aca232cc9905dd5bc4803ed85cfcdcfcf707e52049eda7"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:720ef9d3a4f9ca575a780af283c8fd3a0674b307651c1976714745090da5a9e8"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:223160bb359281bb8e31c8f1068bf71a6b16a8ad3d9524ca6f523ac666bb6a1e"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:560135542c91eaa74247a2e8430cf83c4342b29e8ad4f520ae14f0c8a19cfb5b"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:dd021efa85970bd4824acacbb922066159d0f9e546389a4743d56919b6758b91"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_armv7l.whl", hash = "sha256:78864cc8f23dbee55be34cc1494632a7ba30263951b5b2e8fc8286b95845f82c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_i686.whl", hash = "sha256:1e9679245e3ea6498494b3028b90c7b25dbb2abe65c7d07423ecfc2d6218ff7c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64.whl", hash = "sha256:9413384f26b5d050b6978e6fcd0c1e7f0539be7a4f1a885061473c5deaa57221"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:294b7a598974b8e2c6123d19ef15de9abcd282b0fbbdbc4d23dfa812959a9e05"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_s390x.whl", hash = "sha256:26dd201857d702bdf9d78c273cafcab5871dd29343748524695cecffa44a8d97"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:0f9332243355643d567697c3e3fa07330a1d1abf981611654a1f2bf2175612b7"},
{file = "watchdog-5.0.3-py3-none-win32.whl", hash = "sha256:c66f80ee5b602a9c7ab66e3c9f36026590a0902db3aea414d59a2f55188c1f49"},
{file = "watchdog-5.0.3-py3-none-win_amd64.whl", hash = "sha256:f00b4cf737f568be9665563347a910f8bdc76f88c2970121c86243c8cfdf90e9"},
{file = "watchdog-5.0.3-py3-none-win_ia64.whl", hash = "sha256:49f4d36cb315c25ea0d946e018c01bb028048023b9e103d3d3943f58e109dd45"},
{file = "watchdog-5.0.3.tar.gz", hash = "sha256:108f42a7f0345042a854d4d0ad0834b741d421330d5f575b81cb27b883500176"},
]

[package.extras]
watchmedo = ["PyYAML (>=3.10)"]

[[package]]
name = "wcwidth"
version = "0.2.13"
@ -7095,6 +7563,17 @@ files = [
{file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
]

[[package]]
name = "webencodings"
version = "0.5.1"
description = "Character encoding aliases for legacy web content"
optional = false
python-versions = "*"
files = [
{file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
{file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
]

[[package]]
name = "wheel"
version = "0.44.0"
@ -7462,4 +7941,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "536b2f199fe70180aa31e55e7ad47a75a0b64cd20bbe96caec294037966c7b00"
content-hash = "cae7819c1a144a8aa2b700d0399d7e9d78b55b3c743cfb0b118f4bd0baa2d34e"
pyproject.toml
@ -72,6 +72,8 @@ pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"

[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
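The two mkdocs dev dependencies added above are what drive the new docs site. As a minimal sketch of the assumed local workflow (standard Poetry and MkDocs CLIs; none of these commands appear in this diff):

# Sketch of the assumed docs workflow: install the dev group (which now
# pulls in mkdocs-material and mkdocs-jupyter via poetry.lock), then
# build or serve the site defined by mkdocs.yml at the repository root.
import subprocess

# Resolve and install all dependency groups, including dev.
subprocess.run(["poetry", "install"], check=True)

# Build the static site (by default into ./site).
subprocess.run(["poetry", "run", "mkdocs", "build"], check=True)

# Or serve it with live reload at http://127.0.0.1:8000 while editing.
subprocess.run(["poetry", "run", "mkdocs", "serve"], check=True)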