From db3ceefd4ae6251a97e333bcb03051698b3fa71a Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:54:46 +0100
Subject: [PATCH] docs: improve docs on token limit warning triggered by
 HybridChunker (#1077)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docs/examples/hybrid_chunking.ipynb | 21 ++++++++++++++++++---
 docs/faq.md                         | 15 +++++++++------
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index 2b7861a..6a5f588 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -83,7 +83,15 @@
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    }
+   ],
    "source": [
     "from docling.chunking import HybridChunker\n",
     "\n",
@@ -91,6 +99,13 @@
     "chunk_iter = chunker.chunk(dl_doc=doc)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -337,11 +352,11 @@
    "source": [
     "for i, chunk in enumerate(chunks):\n",
     "    print(f\"=== {i} ===\")\n",
-    "    txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
+    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
     "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
     "\n",
     "    ser_txt = chunker.serialize(chunk=chunk)\n",
-    "    ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
+    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
     "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
     "\n",
     "    print()"
diff --git a/docs/faq.md b/docs/faq.md
index 5f54d86..ae57446 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -150,7 +150,7 @@ This is a collection of FAQ collected from the user questions on
 
-    > Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
+    > Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
 
     This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model.
@@ -163,14 +163,17 @@ This is a collection of FAQ collected from the user questions on
-        ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))
-        if ser_tokens > max_len:
-            max_len = ser_tokens
+        ser_tokens = len(tokenizer.tokenize(ser_txt))
+        if ser_tokens > chunk_max_len:
+            chunk_max_len = ser_tokens
         print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
-    print(f"{max_len=}")
+    print(f"Longest chunk yielded: {chunk_max_len} tokens")
+    print(f"Model max length: {tokenizer.model_max_length}")
     ```
+
+    Also see [docling#725](https://github.com/DS4SD/docling/issues/725).
+
+    Source: Issue [docling-core#119](https://github.com/DS4SD/docling-core/issues/119)
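
For readers landing on this patch without the surrounding docs, the snippet below pieces the documented workflow together end to end: chunk a document with `HybridChunker`, then count the tokens of each serialized chunk against the embedding model's limit. This is a sketch based on the code shown in the diff above, not part of the commit; the embedding model ID and the input path are placeholder assumptions.

```python
# Sketch of the workflow this patch documents (model ID and input
# path are assumptions, not part of the commit).
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from transformers import AutoTokenizer

EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # assumed model

# Convert a source document into a DoclingDocument (assumed input file).
doc = DocumentConverter().convert(source="report.pdf").document

tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
chunker = HybridChunker(tokenizer=tokenizer)
chunks = list(chunker.chunk(dl_doc=doc))

# tokenize() below may emit the "Token indices sequence length..." warning
# discussed in the FAQ; it is harmless here because we only count tokens
# and never pass the long sequence through the embedding model itself.
chunk_max_len = 0
for i, chunk in enumerate(chunks):
    ser_txt = chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt))
    chunk_max_len = max(chunk_max_len, ser_tokens)
    print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
print(f"Longest chunk yielded: {chunk_max_len} tokens")
print(f"Model max length: {tokenizer.model_max_length}")
```

Counting tokens on `chunker.serialize(chunk)` rather than on `chunk.text`, as the updated notebook cell does, matters because the serialized form, enriched with contextual metadata such as headings, is what typically gets passed to the embedding model.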