From db3ceefd4ae6251a97e333bcb03051698b3fa71a Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:54:46 +0100
Subject: [PATCH] docs: improve docs on token limit warning triggered by
 HybridChunker (#1077)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docs/examples/hybrid_chunking.ipynb | 21 ++++++++++++++++++---
 docs/faq.md                         | 15 +++++++++------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index 2b7861a..6a5f588 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -83,7 +83,15 @@
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    }
+   ],
    "source": [
     "from docling.chunking import HybridChunker\n",
     "\n",
@@ -91,6 +99,13 @@
     "chunk_iter = chunker.chunk(dl_doc=doc)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -337,11 +352,11 @@
    "source": [
     "for i, chunk in enumerate(chunks):\n",
     "    print(f\"=== {i} ===\")\n",
-    "    txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
+    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
     "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
     "\n",
     "    ser_txt = chunker.serialize(chunk=chunk)\n",
-    "    ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
+    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
     "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
     "\n",
     "    print()"
diff --git a/docs/faq.md b/docs/faq.md
index 5f54d86..ae57446 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -150,7 +150,7 @@ This is a collection of FAQ collected from the user questions on
 
-    > Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
+    > Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
 
     This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model.
@@ -163,14 +163,17 @@ This is a collection of FAQ collected from the user questions on
-        ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))
-        if ser_tokens > max_len:
-            max_len = ser_tokens
+        ser_tokens = len(tokenizer.tokenize(ser_txt))
+        if ser_tokens > chunk_max_len:
+            chunk_max_len = ser_tokens
         print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
-    print(f"{max_len=}")
+    print(f"Longest chunk yielded: {chunk_max_len} tokens")
+    print(f"Model max length: {tokenizer.model_max_length}")
     ```
+
+    Also see [docling#725](https://github.com/DS4SD/docling/issues/725).
+
+    Source: Issue [docling-core#119](https://github.com/DS4SD/docling-core/issues/119)
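
For readers landing on this patch without the surrounding docs, the snippet below pieces the documented workflow together end to end: chunk a document with `HybridChunker`, then count the tokens of each serialized chunk against the embedding model's limit. This is a sketch based on the code shown in the diff above, not part of the commit; the embedding model ID and the input path are placeholder assumptions.

```python
# Sketch of the workflow this patch documents (model ID and input
# path are assumptions, not part of the commit).
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # assumed model

# Convert a source document into a DoclingDocument (assumed input file).
doc = DocumentConverter().convert(source="report.pdf").document

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
chunker = HybridChunker(tokenizer=tokenizer)
chunks = list(chunker.chunk(dl_doc=doc))

# tokenize() below may emit the "Token indices sequence length..." warning
# discussed in the FAQ; it is harmless here because we only count tokens
# and never pass the long sequence through the embedding model itself.
chunk_max_len = 0
for i, chunk in enumerate(chunks):
    ser_txt = chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt))
    chunk_max_len = max(chunk_max_len, ser_tokens)
    print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
print(f"Longest chunk yielded: {chunk_max_len} tokens")
print(f"Model max length: {tokenizer.model_max_length}")
```

Counting tokens on `chunker.serialize(chunk)` rather than on `chunk.text`, as the updated notebook cell does, matters because the serialized form, enriched with contextual metadata such as headings, is what typically gets passed to the embedding model.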