docs: introduce docs site (#141)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Panos Vagenas 2024-10-14 14:13:13 +02:00 committed by GitHub
parent 2b1e72d327
commit d504432c1e
25 changed files with 1324 additions and 574 deletions

README.md

@ -1,6 +1,6 @@
<p align="center">
<a href="https://github.com/ds4sd/docling">
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/logo.png" width="150" />
</a>
</p>
@ -200,8 +200,8 @@ To see all available options (export formats etc.) run `docling --help`.
### RAG
Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
- [Basic RAG pipeline with LlamaIndex 🦙](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_llamaindex.ipynb)
- [Basic RAG pipeline with LangChain 🦜🔗](https://github.com/DS4SD/docling/tree/main/docs/examples/rag_langchain.ipynb)
## Advanced features

Binary image file (258 KiB before and after)

Binary image file (18 KiB before and after)

105
docs/examples/batch_convert.py Normal file

@ -0,0 +1,105 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
partial_success_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {conv_res.input.file} was partially converted with the following errors:"
)
for item in conv_res.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
)
return success_count, partial_success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110.pdf"),
Path("./tests/data/redp5695.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

175
docs/examples/custom_convert.py Normal file

@ -0,0 +1,175 @@
import json
import logging
import time
from pathlib import Path
from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import (
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0
failure_count = 0
for conv_res in conv_results:
if conv_res.status == ConversionStatus.SUCCESS:
success_count += 1
doc_filename = conv_res.input.file.stem
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open(
"w", encoding="utf-8"
) as fp:
fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open(
"w", encoding="utf-8"
) as fp:
fp.write(conv_res.render_as_doctags())
else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
_log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
return success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
###########################################################################
# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.
# PyPdfium without EasyOCR
# --------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=False
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = False
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=PyPdfiumDocumentBackend,
# )
# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
###########################################################################
# Define input files
input = DocumentConversionInput.from_paths(input_doc_paths)
start_time = time.time()
conv_results = doc_converter.convert(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

85
docs/examples/export_figures.py Normal file

@ -0,0 +1,85 @@
import logging
import time
from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
TableElement,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
start_time = time.time()
conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
# Export page images
for page in conv_res.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")
# Export figures and tables
for element, image in conv_res.render_element_images(
element_types=(FigureElement, TableElement)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
success_count += 1
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

116
docs/examples/export_multimodal.py Normal file

@ -0,0 +1,116 @@
import datetime
import logging
import time
from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
IMAGE_RESOLUTION_SCALE = 2.0
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
start_time = time.time()
converted_docs = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue
rows = []
for (
content_text,
content_md,
content_dt,
page_cells,
page_segments,
page,
) in generate_multimodal_pages(doc):
dpi = page._default_image_scale * 72
rows.append(
{
"document": doc.input.file.name,
"hash": doc.input.document_hash,
"page_hash": page.page_hash,
"image": {
"width": page.image.width,
"height": page.image.height,
"bytes": page.image.tobytes(),
},
"cells": page_cells,
"contents": content_text,
"contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments,
"extra": {
"page_num": page.page_no + 1,
"width_in_points": page.size.width,
"height_in_points": page.size.height,
"dpi": dpi,
},
}
)
success_count += 1
# Generate one parquet from all documents
df = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
# This block demonstrates how the file can be opened with the HF datasets library
# from datasets import Dataset
# from PIL import Image
# multimodal_df = pd.read_parquet(output_filename)
# # Convert pandas DataFrame to Hugging Face Dataset and load bytes into image
# dataset = Dataset.from_pandas(multimodal_df)
# def transforms(examples):
# examples["image"] = Image.frombytes('RGB', (examples["image.width"], examples["image.height"]), examples["image.bytes"], 'raw')
# return examples
# dataset = dataset.map(transforms)
if __name__ == "__main__":
main()

74
docs/examples/export_tables.py Normal file

@ -0,0 +1,74 @@
import logging
import time
from pathlib import Path
from typing import Tuple
import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
input_files = DocumentConversionInput.from_paths(input_doc_paths)
doc_converter = DocumentConverter()
start_time = time.time()
conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
# Export tables
for table_ix, table in enumerate(conv_res.output.tables):
table_df: pd.DataFrame = table.export_to_dataframe()
print(f"## Table {table_ix}")
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = (
output_dir / f"{doc_filename}-table-{table_ix+1}.html"
)
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html())
success_count += 1
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

6
docs/examples/minimal.py Normal file

@ -0,0 +1,6 @@
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown())  # output: "## Docling Technical Report [...]"

docs/examples/rag_langchain.ipynb

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦜🔗 LangChain"
"# RAG with LangChain 🦜🔗"
]
},
{

docs/examples/rag_llamaindex.ipynb

@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# RAG with Docling and 🦙 LlamaIndex"
"# RAG with LlamaIndex 🦙"
]
},
{
@ -25,9 +25,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"LlamaIndex extensions `DoclingReader` and `DoclingNodeParser` presented in this notebook seamlessly integrate Docling into LlamaIndex, enabling you to:\n",
"This example leverages the official [LlamaIndex Docling extension](../../integrations/llamaindex/).\n",
"\n",
"Presented extensions `DoclingReader` and `DoclingNodeParser` enable you to:\n",
"- use PDF documents in your LLM applications with ease and speed, and\n",
"- leverage Docling's rich format for advanced, document-native grounding."
"- harness Docling's rich format for advanced, document-native grounding."
]
},
{

29
docs/index.md Normal file

@ -0,0 +1,29 @@
# Docling
<p align="center">
<a href="https://ds4sd.github.io/docling/">
<img loading="lazy" alt="Docling" src="assets/logo.png" width="150" />
</a>
</p>
[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
## Features
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
* 📑 Understands detailed page layout, reading order and recovers table structures
* 📝 Extracts metadata from the document, such as title, authors, references and language
* 🔍 Includes OCR support for scanned PDFs
* 🤖 Integrates easily with LLM app / RAG frameworks like LlamaIndex&nbsp;🦙 & LangChain&nbsp;🦜🔗
* 💻 Provides a simple and convenient CLI
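A minimal usage example, mirroring the new `docs/examples/minimal.py` added in this commit, looks as follows:

```python
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL

converter = DocumentConverter()
doc = converter.convert_single(source)

# Print the converted document as Markdown,
# e.g. "## Docling Technical Report [...]"
print(doc.render_as_markdown())
```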

100
docs/installation.md Normal file

@ -0,0 +1,100 @@
To use Docling, simply install `docling` from your Python package manager, e.g. pip:
```bash
pip install docling
```
Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
??? "Alternative PyTorch distributions"
The Docling models depend on the [PyTorch](https://pytorch.org/) library.
Depending on your architecture, you might want to use a different distribution of `torch`.
For example, you might want support for different accelerator or for a cpu-only version.
All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
One common situation is the installation on Linux systems with cpu-only support.
In this case, we suggest the installation of Docling with the following options
```bash
# Example for installing on the Linux cpu-only version
pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
```
??? "Alternative OCR engines"
Docling supports multiple OCR engines for processing scanned documents. The current version provides
the following engines.
| Engine | Installation | Usage |
| ------ | ------------ | ----- |
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:
```python
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter
pipeline_options = PipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
doc_converter = DocumentConverter(
pipeline_options=pipeline_options,
)
```
<h3>Tesseract installation</h3>
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine available on most operating systems. To use it with Docling, Tesseract must be installed on your system using the packaging tool of your choice. Example commands are provided below.
After installing Tesseract you are expected to provide the path to its language files using the
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
=== "macOS (via [Homebrew](https://brew.sh/))"
```console
brew install tesseract leptonica pkg-config
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
=== "Debian-based"
```console
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
=== "RHEL"
```console
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
<h3>Linking to Tesseract</h3>
The most efficient way to use the Tesseract library is via linking. Docling uses the
[Tesserocr](https://github.com/sirfz/tesserocr) package for this.
If you run into installation issues with Tesserocr, we suggest using the following
installation options:
```console
pip uninstall tesserocr
pip install --no-binary :all: tesserocr
```
## Development setup
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
```bash
poetry install --all-extras
```

25
docs/integrations/llamaindex.md Normal file

@ -0,0 +1,25 @@
## Get started
Docling is available as an official LlamaIndex extension!
To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.
## Components
### Docling Reader
Reads document files and uses Docling to populate LlamaIndex `Document` objects — either serializing Docling's data model (losslessly, e.g. as JSON) or exporting to a simplified format (lossily, e.g. as Markdown).
- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/readers/llama-index-readers-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/readers/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-readers-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/readers/llama-index-readers-docling)<!--{target="_blank"}-->
### Docling Node Parser
Reads LlamaIndex `Document` objects populated in Docling's format by Docling Reader and, using its knowledge of the Docling format, parses them to LlamaIndex `Node` objects for downstream usage in LlamaIndex applications, e.g. as chunks for embedding.
- 💻 [GitHub \[↗\]](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
- 📖 [API docs \[↗\]](https://docs.llamaindex.ai/en/stable/api_reference/node_parser/docling/)<!--{target="_blank"}-->
- 📦 [PyPI \[↗\]](https://pypi.org/project/llama-index-node-parser-docling/)<!--{target="_blank"}-->
- 🦙 [LlamaHub \[↗\]](https://llamahub.ai/l/node_parser/llama-index-node-parser-docling)<!--{target="_blank"}-->
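As an illustrative sketch (not part of this commit), the two components are typically combined along the following lines; the import paths, the `export_type` parameter, and the `load_data` call reflect the published extension packages and should be treated as assumptions here:

```python
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# Read a document with Docling; the JSON export type (assumed name: ExportType.JSON)
# keeps the rich Docling format that DoclingNodeParser knows how to parse.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
documents = reader.load_data("https://arxiv.org/pdf/2408.09869")

# Parse the Docling-format documents into LlamaIndex nodes,
# e.g. to use as chunks for embedding downstream.
node_parser = DoclingNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)
print(f"Parsed {len(nodes)} nodes from {len(documents)} document(s)")
```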

7
docs/overrides/main.html Normal file

@ -0,0 +1,7 @@
{% extends "base.html" %}
{#
{% block announce %}
<p>🎉 Docling is now officially supported in LlamaIndex! <a href="{{ 'integrations/llamaindex/' | url }}">Check it out</a>!</p>
{% endblock %}
#}

3
docs/stylesheets/extra.css Normal file

@ -0,0 +1,3 @@
[data-md-color-scheme="default"] .md-banner a {
color: #5e8bde;
}

examples/batch_convert.py (file deleted)

@ -1,105 +0,0 @@
(105 lines removed; identical to docs/examples/batch_convert.py added above, now replaced by the symbolic link below)

1
examples/batch_convert.py Symbolic link

@ -0,0 +1 @@
../docs/examples/batch_convert.py

examples/custom_convert.py (file deleted)

@ -1,175 +0,0 @@
(175 lines removed; identical to docs/examples/custom_convert.py added above, now replaced by the symbolic link below)

1
examples/custom_convert.py Symbolic link

@ -0,0 +1 @@
../docs/examples/custom_convert.py

examples/export_figures.py (file deleted)

@ -1,85 +0,0 @@
(85 lines removed; identical to docs/examples/export_figures.py added above, now replaced by the symbolic link below)

1
examples/export_figures.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_figures.py

examples/export_multimodal.py (file deleted)

@ -1,116 +0,0 @@
(116 lines removed; identical to docs/examples/export_multimodal.py added above, now replaced by the symbolic link below)

1
examples/export_multimodal.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_multimodal.py

examples/export_tables.py (file deleted)

@ -1,74 +0,0 @@
(74 lines removed; identical to docs/examples/export_tables.py added above, now replaced by the symbolic link below)

1
examples/export_tables.py Symbolic link

@ -0,0 +1 @@
../docs/examples/export_tables.py

examples/minimal.py (file deleted)

@ -1,6 +0,0 @@
(6 lines removed; identical to docs/examples/minimal.py added above, now replaced by the symbolic link below)

1
examples/minimal.py Symbolic link

@ -0,0 +1 @@
../docs/examples/minimal.py

97
mkdocs.yml Normal file

@ -0,0 +1,97 @@
site_name: Docling
site_url: https://ds4sd.github.io/docling/
repo_name: DS4SD/docling
repo_url: https://github.com/DS4SD/docling
theme:
name: material
custom_dir: docs/overrides
palette:
# Palette toggle for automatic mode
- media: "(prefers-color-scheme)"
scheme: default
primary: black
toggle:
icon: material/brightness-auto
name: Switch to light mode
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: black
toggle:
icon: material/brightness-7
name: Switch to dark mode
# Palette toggle for dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
toggle:
icon: material/brightness-4
name: Switch to system preference
logo: assets/logo.png
favicon: assets/logo.png
features:
- content.tabs.link
- content.code.annotate
- content.code.copy
- announce.dismiss
- navigation.tabs
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.instant
- navigation.instant.prefetch
# - navigation.instant.preview
- navigation.instant.progress
- navigation.path
- navigation.sections # <=
- navigation.top
- navigation.tracking
- search.suggest
- toc.follow
nav:
- Get started:
- Home: index.md
- Installation: installation.md
# - Docling v2: v2.md
# - Concepts:
# - Docling Document: concepts/document.md
# - Chunking: concepts/chunking.md
- Examples:
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
- "Batch conversion": examples/batch_convert.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
# - Chunking:
# - Chunking: examples/chunking.md
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md
# - API reference:
# - API reference: api_reference/index.md
markdown_extensions:
- pymdownx.superfences
- pymdownx.tabbed:
alternate_style: true
slugify: !!python/object/apply:pymdownx.slugs.slugify
kwds:
case: lower
- admonition
- pymdownx.details
- attr_list
plugins:
- search
- mkdocs-jupyter
extra_css:
- stylesheets/extra.css

491
poetry.lock generated

@ -263,6 +263,20 @@ files = [
pycodestyle = ">=2.11.0"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[[package]]
name = "babel"
version = "2.16.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
files = [
{file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"},
{file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"},
]
[package.extras]
dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"]
[[package]]
name = "backports-tarfile"
version = "1.2.0"
@ -347,6 +361,24 @@ d = ["aiohttp (>=3.10)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "bleach"
version = "6.1.0"
description = "An easy safelist-based HTML-sanitizing tool."
optional = false
python-versions = ">=3.8"
files = [
{file = "bleach-6.1.0-py3-none-any.whl", hash = "sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6"},
{file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"},
]
[package.dependencies]
six = ">=1.9.0"
webencodings = "*"
[package.extras]
css = ["tinycss2 (>=1.1.0,<1.3)"]
[[package]]
name = "certifi"
version = "2024.8.30"
@ -931,6 +963,17 @@ tqdm = ">=4.64.0,<5.0.0"
[package.extras]
toolkit = ["deepsearch-toolkit (>=0.31.0)"]
[[package]]
name = "defusedxml"
version = "0.7.1"
description = "XML bomb protection for Python stdlib modules"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
{file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"},
{file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"},
]
[[package]]
name = "deprecated"
version = "1.2.14"
@ -1185,6 +1228,20 @@ files = [
[package.extras]
tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"]
[[package]]
name = "fastjsonschema"
version = "2.20.0"
description = "Fastest Python implementation of JSON schema"
optional = false
python-versions = "*"
files = [
{file = "fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a"},
{file = "fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23"},
]
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "filelock"
version = "3.16.1"
@ -1444,6 +1501,23 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
tqdm = ["tqdm"]
[[package]]
name = "ghp-import"
version = "2.1.0"
description = "Copy your docs directly to the gh-pages branch."
optional = false
python-versions = "*"
files = [
{file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"},
{file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"},
]
[package.dependencies]
python-dateutil = ">=2.8.1"
[package.extras]
dev = ["flake8", "markdown", "twine", "wheel"]
[[package]]
name = "gitdb"
version = "4.0.11"
@ -2214,6 +2288,17 @@ traitlets = ">=5.3"
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"]
test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"]
[[package]]
name = "jupyterlab-pygments"
version = "0.3.0"
description = "Pygments theme using JupyterLab CSS variables"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"},
{file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"},
]
[[package]]
name = "jupyterlab-widgets"
version = "3.0.13"
@ -2225,6 +2310,35 @@ files = [
{file = "jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed"},
]
[[package]]
name = "jupytext"
version = "1.16.4"
description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts"
optional = false
python-versions = ">=3.8"
files = [
{file = "jupytext-1.16.4-py3-none-any.whl", hash = "sha256:76989d2690e65667ea6fb411d8056abe7cd0437c07bd774660b83d62acf9490a"},
{file = "jupytext-1.16.4.tar.gz", hash = "sha256:28e33f46f2ce7a41fb9d677a4a2c95327285579b64ca104437c4b9eb1e4174e9"},
]
[package.dependencies]
markdown-it-py = ">=1.0"
mdit-py-plugins = "*"
nbformat = "*"
packaging = "*"
pyyaml = "*"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
docs = ["myst-parser", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"]
test = ["pytest", "pytest-randomly", "pytest-xdist"]
test-cov = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist"]
test-external = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"]
test-functional = ["pytest", "pytest-randomly", "pytest-xdist"]
test-integration = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-randomly", "pytest-xdist"]
test-ui = ["calysto-bash"]
[[package]]
name = "keyring"
version = "25.4.1"
@ -2777,6 +2891,21 @@ html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (==0.29.37)"]
[[package]]
name = "markdown"
version = "3.7"
description = "Python implementation of John Gruber's Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"},
{file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"},
]
[package.extras]
docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"]
testing = ["coverage", "pyyaml"]
[[package]]
name = "markdown-it-py"
version = "3.0.0"
@ -3008,6 +3137,25 @@ files = [
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]
[[package]]
name = "mdit-py-plugins"
version = "0.4.2"
description = "Collection of plugins for markdown-it-py"
optional = false
python-versions = ">=3.8"
files = [
{file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"},
{file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"},
]
[package.dependencies]
markdown-it-py = ">=1.0.0,<4.0.0"
[package.extras]
code-style = ["pre-commit"]
rtd = ["myst-parser", "sphinx-book-theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "mdurl"
version = "0.1.2"
@ -3034,6 +3182,17 @@ files = [
numpy = "*"
pandas = "*"
[[package]]
name = "mergedeep"
version = "1.3.4"
description = "A deep merge function for 🐍."
optional = false
python-versions = ">=3.6"
files = [
{file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"},
{file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"},
]
[[package]]
name = "milvus-lite"
version = "2.4.10"
@ -3067,6 +3226,122 @@ files = [
{file = "minijinja-2.2.0.tar.gz", hash = "sha256:4411052c7a60f8d56468cc6d17d45d72be3d5e89e9578a04f8336cc56601523c"},
]
[[package]]
name = "mistune"
version = "3.0.2"
description = "A sane and fast Markdown parser with useful plugins and renderers"
optional = false
python-versions = ">=3.7"
files = [
{file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"},
{file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"},
]
[[package]]
name = "mkdocs"
version = "1.6.1"
description = "Project documentation with Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"},
{file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"},
]
[package.dependencies]
click = ">=7.0"
colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""}
ghp-import = ">=1.0"
jinja2 = ">=2.11.1"
markdown = ">=3.3.6"
markupsafe = ">=2.0.1"
mergedeep = ">=1.3.4"
mkdocs-get-deps = ">=0.2.0"
packaging = ">=20.5"
pathspec = ">=0.11.1"
pyyaml = ">=5.1"
pyyaml-env-tag = ">=0.1"
watchdog = ">=2.0"
[package.extras]
i18n = ["babel (>=2.9.0)"]
min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
[[package]]
name = "mkdocs-get-deps"
version = "0.2.0"
description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"},
{file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"},
]
[package.dependencies]
mergedeep = ">=1.3.4"
platformdirs = ">=2.2.0"
pyyaml = ">=5.1"
[[package]]
name = "mkdocs-jupyter"
version = "0.25.0"
description = "Use Jupyter in mkdocs websites"
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocs_jupyter-0.25.0-py3-none-any.whl", hash = "sha256:d83d71deef19f0401505945bf92ec3bd5b40615af89308e72d5112929f8ee00b"},
{file = "mkdocs_jupyter-0.25.0.tar.gz", hash = "sha256:e26c1d341916bc57f96ea3f93d8d0a88fc77c87d4cee222f66d2007798d924f5"},
]
[package.dependencies]
ipykernel = ">6.0.0,<7.0.0"
jupytext = ">1.13.8,<2"
mkdocs = ">=1.4.0,<2"
mkdocs-material = ">9.0.0"
nbconvert = ">=7.2.9,<8"
pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
version = "9.5.40"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.5.40-py3-none-any.whl", hash = "sha256:8e7a16ada34e79a7b6459ff2602584222f522c738b6a023d1bea853d5049da6f"},
{file = "mkdocs_material-9.5.40.tar.gz", hash = "sha256:b69d70e667ec51fc41f65e006a3184dd00d95b2439d982cb1586e4c018943156"},
]
[package.dependencies]
babel = ">=2.10,<3.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
mkdocs = ">=1.6,<2.0"
mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"
[package.extras]
git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"]
imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"]
recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"]
[[package]]
name = "mkdocs-material-extensions"
version = "1.3.1"
description = "Extension pack for Python Markdown and MkDocs Material."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"},
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
]
[[package]]
name = "more-itertools"
version = "10.5.0"
@ -3281,6 +3556,86 @@ files = [
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]]
name = "nbclient"
version = "0.10.0"
description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor."
optional = false
python-versions = ">=3.8.0"
files = [
{file = "nbclient-0.10.0-py3-none-any.whl", hash = "sha256:f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f"},
{file = "nbclient-0.10.0.tar.gz", hash = "sha256:4b3f1b7dba531e498449c4db4f53da339c91d449dc11e9af3a43b4eb5c5abb09"},
]
[package.dependencies]
jupyter-client = ">=6.1.12"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
nbformat = ">=5.1"
traitlets = ">=5.4"
[package.extras]
dev = ["pre-commit"]
docs = ["autodoc-traits", "mock", "moto", "myst-parser", "nbclient[test]", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling"]
test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"]
[[package]]
name = "nbconvert"
version = "7.16.4"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.4-py3-none-any.whl", hash = "sha256:05873c620fe520b6322bf8a5ad562692343fe3452abda5765c7a34b7d1aa3eb3"},
{file = "nbconvert-7.16.4.tar.gz", hash = "sha256:86ca91ba266b0a448dc96fa6c5b9d98affabde2867b363258703536807f9f7f4"},
]
[package.dependencies]
beautifulsoup4 = "*"
bleach = "!=5.0.0"
defusedxml = "*"
jinja2 = ">=3.0"
jupyter-core = ">=4.7"
jupyterlab-pygments = "*"
markupsafe = ">=2.0"
mistune = ">=2.0.3,<4"
nbclient = ">=0.5.0"
nbformat = ">=5.7"
packaging = "*"
pandocfilters = ">=1.4.1"
pygments = ">=2.4.1"
tinycss2 = "*"
traitlets = ">=5.1"
[package.extras]
all = ["flaky", "ipykernel", "ipython", "ipywidgets (>=7.5)", "myst-parser", "nbsphinx (>=0.2.12)", "playwright", "pydata-sphinx-theme", "pyqtwebengine (>=5.15)", "pytest (>=7)", "sphinx (==5.0.2)", "sphinxcontrib-spelling", "tornado (>=6.1)"]
docs = ["ipykernel", "ipython", "myst-parser", "nbsphinx (>=0.2.12)", "pydata-sphinx-theme", "sphinx (==5.0.2)", "sphinxcontrib-spelling"]
qtpdf = ["pyqtwebengine (>=5.15)"]
qtpng = ["pyqtwebengine (>=5.15)"]
serve = ["tornado (>=6.1)"]
test = ["flaky", "ipykernel", "ipywidgets (>=7.5)", "pytest (>=7)"]
webpdf = ["playwright"]
[[package]]
name = "nbformat"
version = "5.10.4"
description = "The Jupyter Notebook format"
optional = false
python-versions = ">=3.8"
files = [
{file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"},
{file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"},
]
[package.dependencies]
fastjsonschema = ">=2.15"
jsonschema = ">=2.6"
jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0"
traitlets = ">=5.1"
[package.extras]
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"]
test = ["pep440", "pre-commit", "pytest", "testpath"]
[[package]]
name = "nbqa"
version = "1.9.0"
@@ -3758,6 +4113,21 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]
[[package]]
name = "paginate"
version = "0.5.7"
description = "Divides large result sets into pages for easier browsing"
optional = false
python-versions = "*"
files = [
{file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"},
{file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"},
]
[package.extras]
dev = ["pytest", "tox"]
lint = ["black"]
[[package]]
name = "pandas"
version = "2.2.3"
@@ -3859,6 +4229,17 @@ files = [
numpy = ">=1.23.5"
types-pytz = ">=2022.1.1"
[[package]]
name = "pandocfilters"
version = "1.5.1"
description = "Utilities for writing pandoc filters in python"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
{file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"},
{file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"},
]
[[package]]
name = "parso"
version = "0.8.4"
@@ -4574,6 +4955,24 @@ tomlkit = ">=0.10.1"
spelling = ["pyenchant (>=3.2,<4.0)"]
testutils = ["gitpython (>3)"]
[[package]]
name = "pymdown-extensions"
version = "10.11.2"
description = "Extension pack for Python Markdown."
optional = false
python-versions = ">=3.8"
files = [
{file = "pymdown_extensions-10.11.2-py3-none-any.whl", hash = "sha256:41cdde0a77290e480cf53892f5c5e50921a7ee3e5cd60ba91bf19837b33badcf"},
{file = "pymdown_extensions-10.11.2.tar.gz", hash = "sha256:bc8847ecc9e784a098efd35e20cba772bc5a1b529dfcef9dc1972db9021a1049"},
]
[package.dependencies]
markdown = ">=3.6"
pyyaml = "*"
[package.extras]
extra = ["pygments (>=2.12)"]
[[package]]
name = "pymilvus"
version = "2.4.7"
@@ -5027,6 +5426,20 @@ files = [
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
]
[[package]]
name = "pyyaml-env-tag"
version = "0.1"
description = "A custom YAML tag for referencing environment variables in YAML files. "
optional = false
python-versions = ">=3.6"
files = [
{file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"},
{file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"},
]
[package.dependencies]
pyyaml = "*"
[[package]]
name = "pyzmq"
version = "26.2.0"
@@ -6342,6 +6755,24 @@ requests = ">=2.26.0"
[package.extras]
blobfile = ["blobfile (>=2)"]
[[package]]
name = "tinycss2"
version = "1.3.0"
description = "A tiny CSS parser"
optional = false
python-versions = ">=3.8"
files = [
{file = "tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7"},
{file = "tinycss2-1.3.0.tar.gz", hash = "sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d"},
]
[package.dependencies]
webencodings = ">=0.4"
[package.extras]
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["pytest", "ruff"]
[[package]]
name = "tokenize-rt"
version = "6.0.0"
@@ -6843,11 +7274,6 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
]
[package.dependencies]
@@ -7084,6 +7510,48 @@ platformdirs = ">=3.9.1,<5"
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]]
name = "watchdog"
version = "5.0.3"
description = "Filesystem events monitoring"
optional = false
python-versions = ">=3.9"
files = [
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:85527b882f3facda0579bce9d743ff7f10c3e1e0db0a0d0e28170a7d0e5ce2ea"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:53adf73dcdc0ef04f7735066b4a57a4cd3e49ef135daae41d77395f0b5b692cb"},
{file = "watchdog-5.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e25adddab85f674acac303cf1f5835951345a56c5f7f582987d266679979c75b"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f01f4a3565a387080dc49bdd1fefe4ecc77f894991b88ef927edbfa45eb10818"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91b522adc25614cdeaf91f7897800b82c13b4b8ac68a42ca959f992f6990c490"},
{file = "watchdog-5.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d52db5beb5e476e6853da2e2d24dbbbed6797b449c8bf7ea118a4ee0d2c9040e"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:94d11b07c64f63f49876e0ab8042ae034674c8653bfcdaa8c4b32e71cfff87e8"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:349c9488e1d85d0a58e8cb14222d2c51cbc801ce11ac3936ab4c3af986536926"},
{file = "watchdog-5.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:53a3f10b62c2d569e260f96e8d966463dec1a50fa4f1b22aec69e3f91025060e"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:950f531ec6e03696a2414b6308f5c6ff9dab7821a768c9d5788b1314e9a46ca7"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae6deb336cba5d71476caa029ceb6e88047fc1dc74b62b7c4012639c0b563906"},
{file = "watchdog-5.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1021223c08ba8d2d38d71ec1704496471ffd7be42cfb26b87cd5059323a389a1"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:752fb40efc7cc8d88ebc332b8f4bcbe2b5cc7e881bccfeb8e25054c00c994ee3"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a2e8f3f955d68471fa37b0e3add18500790d129cc7efe89971b8a4cc6fdeb0b2"},
{file = "watchdog-5.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b8ca4d854adcf480bdfd80f46fdd6fb49f91dd020ae11c89b3a79e19454ec627"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:90a67d7857adb1d985aca232cc9905dd5bc4803ed85cfcdcfcf707e52049eda7"},
{file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:720ef9d3a4f9ca575a780af283c8fd3a0674b307651c1976714745090da5a9e8"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:223160bb359281bb8e31c8f1068bf71a6b16a8ad3d9524ca6f523ac666bb6a1e"},
{file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:560135542c91eaa74247a2e8430cf83c4342b29e8ad4f520ae14f0c8a19cfb5b"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:dd021efa85970bd4824acacbb922066159d0f9e546389a4743d56919b6758b91"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_armv7l.whl", hash = "sha256:78864cc8f23dbee55be34cc1494632a7ba30263951b5b2e8fc8286b95845f82c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_i686.whl", hash = "sha256:1e9679245e3ea6498494b3028b90c7b25dbb2abe65c7d07423ecfc2d6218ff7c"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64.whl", hash = "sha256:9413384f26b5d050b6978e6fcd0c1e7f0539be7a4f1a885061473c5deaa57221"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:294b7a598974b8e2c6123d19ef15de9abcd282b0fbbdbc4d23dfa812959a9e05"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_s390x.whl", hash = "sha256:26dd201857d702bdf9d78c273cafcab5871dd29343748524695cecffa44a8d97"},
{file = "watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:0f9332243355643d567697c3e3fa07330a1d1abf981611654a1f2bf2175612b7"},
{file = "watchdog-5.0.3-py3-none-win32.whl", hash = "sha256:c66f80ee5b602a9c7ab66e3c9f36026590a0902db3aea414d59a2f55188c1f49"},
{file = "watchdog-5.0.3-py3-none-win_amd64.whl", hash = "sha256:f00b4cf737f568be9665563347a910f8bdc76f88c2970121c86243c8cfdf90e9"},
{file = "watchdog-5.0.3-py3-none-win_ia64.whl", hash = "sha256:49f4d36cb315c25ea0d946e018c01bb028048023b9e103d3d3943f58e109dd45"},
{file = "watchdog-5.0.3.tar.gz", hash = "sha256:108f42a7f0345042a854d4d0ad0834b741d421330d5f575b81cb27b883500176"},
]
[package.extras]
watchmedo = ["PyYAML (>=3.10)"]
[[package]]
name = "wcwidth"
version = "0.2.13"
@@ -7095,6 +7563,17 @@ files = [
{file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"},
]
[[package]]
name = "webencodings"
version = "0.5.1"
description = "Character encoding aliases for legacy web content"
optional = false
python-versions = "*"
files = [
{file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
{file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
]
[[package]]
name = "wheel"
version = "0.44.0"
@@ -7462,4 +7941,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "536b2f199fe70180aa31e55e7ad47a75a0b64cd20bbe96caec294037966c7b00"
content-hash = "cae7819c1a144a8aa2b700d0399d7e9d78b55b3c743cfb0b118f4bd0baa2d34e"

View File

@@ -72,6 +72,8 @@ pandas-stubs = "^2.1.4.231227"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"
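The two dev dependencies added above (mkdocs-material and mkdocs-jupyter) are what the new docs site is built with. A minimal sketch of how the site could be built programmatically is shown below; it is not part of this commit, and it assumes an mkdocs.yml exists at the repository root. The helper name build_docs and the default site_dir are illustrative; the calls to mkdocs.config.load_config and mkdocs.commands.build.build are the standard MkDocs build entry points, and the same result can be obtained from the CLI with `mkdocs build`.

# Minimal sketch (assumption, not part of this change): build the MkDocs site from Python,
# given an mkdocs.yml at the repository root that configures the material theme and
# the mkdocs-jupyter plugin.
from pathlib import Path

from mkdocs.commands import build
from mkdocs.config import load_config


def build_docs(config_path: str = "mkdocs.yml", site_dir: str = "site") -> Path:
    # Load the MkDocs configuration, overriding the output directory.
    config = load_config(config_path, site_dir=site_dir)
    # Render the static site into site_dir.
    build.build(config)
    return Path(site_dir)


if __name__ == "__main__":
    print(f"Docs built at: {build_docs()}")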