{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG with Docling and 🦜🔗 LangChain" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# requirements for this example:\n", "%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "\n", "load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(action=\"ignore\", category=UserWarning, module=\"pydantic|torch\")\n", "warnings.filterwarnings(action=\"ignore\", category=FutureWarning, module=\"easyocr\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loader and splitter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below we set up:\n", "- a `Loader` which will be used to create LangChain documents, and\n", "- a splitter, which will be used to split these documents" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from enum import Enum\n", "from typing import Iterator\n", "\n", "from langchain_core.document_loaders import BaseLoader\n", "from langchain_core.documents import Document as LCDocument\n", "from pydantic import BaseModel\n", "\n", "from docling.document_converter import DocumentConverter\n", "\n", "\n", "class DocumentMetadata(BaseModel):\n", " dl_doc_hash: str\n", " # source: str\n", "\n", "\n", "class DoclingPDFLoader(BaseLoader):\n", " class ParseType(str, Enum):\n", " MARKDOWN = \"markdown\"\n", " # JSON = \"json\"\n", "\n", " def __init__(self, file_path: str | list[str], parse_type: ParseType) -> None:\n", " self._file_paths = file_path if isinstance(file_path, list) else [file_path]\n", " self._parse_type = parse_type\n", " self._converter = DocumentConverter()\n", "\n", " def lazy_load(self) -> Iterator[LCDocument]:\n", " for source in self._file_paths:\n", " dl_doc = self._converter.convert_single(source).output\n", " match self._parse_type:\n", " case self.ParseType.MARKDOWN:\n", " text = dl_doc.export_to_markdown()\n", " # case self.ParseType.JSON:\n", " # text = dl_doc.model_dump_json()\n", " case _:\n", " raise RuntimeError(\n", " f\"Unexpected parse type encountered: {self._parse_type}\"\n", " )\n", " lc_doc = LCDocument(\n", " page_content=text,\n", " metadata=DocumentMetadata(\n", " dl_doc_hash=dl_doc.file_info.document_hash,\n", " ).model_dump(),\n", " )\n", " yield lc_doc" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "FILE_PATH = \"https://arxiv.org/pdf/2206.01062\" # DocLayNet paper" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b38d07d5fed4618a44ecf261e1e5c44", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 7 files: 0%| | 0/7 [00:00