docs: update vlm models api examples with LM Studio (#1759)

update vlm models api examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-06-12 05:58:44 -05:00 committed by GitHub
parent 7a275c7637
commit 0432a31b2f
2 changed files with 97 additions and 15 deletions


@@ -13,6 +13,12 @@ from docling.datamodel.pipeline_options import (
)
from docling.document_converter import DocumentConverter, PdfFormatOption

### Example of PictureDescriptionApiOptions definitions

#### Using vLLM
# Models can be launched via:
# $ vllm serve MODEL_NAME

def vllm_local_options(model: str):
    options = PictureDescriptionApiOptions(
@@ -28,6 +34,26 @@ def vllm_local_options(model: str):
    return options
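Before pointing the options at the server, it can help to confirm the endpoint answers. A minimal smoke-test sketch, assuming vLLM's default port 8000 and the Granite Vision model name used later in this example (both are assumptions, not part of this diff):

```python
# Hypothetical smoke test for a locally served OpenAI-compatible endpoint.
# Port 8000 is vLLM's default; the model name is an assumption.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "ibm-granite/granite-vision-3.1-2b-preview",
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 16,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```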
#### Using LM Studio
def lms_local_options(model: str):
    options = PictureDescriptionApiOptions(
        url="http://localhost:1234/v1/chat/completions",
        params=dict(
            model=model,
            seed=42,
            max_completion_tokens=200,
        ),
        prompt="Describe the image in three sentences. Be concise and accurate.",
        timeout=90,
    )
    return options
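A sketch of how these options typically get wired into the PDF pipeline; `PdfPipelineOptions` and its flags follow docling's usual example structure and are assumptions here, not part of this diff:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.enable_remote_services = True  # required for API-backed models
pipeline_options.picture_description_options = lms_local_options(
    model="smolvlm-256m-instruct"
)
```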
#### Using a cloud service like IBM watsonx.ai
def watsonx_vlm_options():
    load_dotenv()
    api_key = os.environ.get("WX_API_KEY")
@@ -49,7 +75,7 @@ def watsonx_vlm_options():
    options = PictureDescriptionApiOptions(
        url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
        params=dict(
-            model_id="meta-llama/llama-3-2-11b-vision-instruct",
+            model_id="ibm/granite-vision-3-2-2b",
            project_id=project_id,
            parameters=dict(
                max_new_tokens=400,
@@ -64,6 +90,9 @@ def watsonx_vlm_options():
    return options
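Since the function reads credentials from the environment, a hedged pre-flight check can save a confusing failure later. WX_API_KEY and WX_PROJECT_ID are the variables this example expects; the check itself is a hypothetical addition:

```python
# Hypothetical pre-flight check for the watsonx.ai credentials.
import os

from dotenv import load_dotenv

load_dotenv()
for var in ("WX_API_KEY", "WX_PROJECT_ID"):
    if not os.environ.get(var):
        raise RuntimeError(f"Missing required environment variable: {var}")
```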
### Usage and conversion
def main():
    logging.basicConfig(level=logging.INFO)
@@ -80,20 +109,28 @@ def main():
    # One possibility is self-hosting a model, e.g. via vLLM.
    # $ vllm serve MODEL_NAME
    # Then PictureDescriptionApiOptions can point to the localhost endpoint.
    #
-    # Example for the Granite Vision model: (uncomment the following lines)
+    # Example for the Granite Vision model:
+    # (uncomment the following lines)
    # pipeline_options.picture_description_options = vllm_local_options(
    #     model="ibm-granite/granite-vision-3.1-2b-preview"
    # )
    #
-    # Example for the SmolVLM model: (uncomment the following lines)
-    pipeline_options.picture_description_options = vllm_local_options(
-        model="HuggingFaceTB/SmolVLM-256M-Instruct"
-    )
+    # Example for the SmolVLM model:
+    # (uncomment the following lines)
+    # pipeline_options.picture_description_options = vllm_local_options(
+    #     model="HuggingFaceTB/SmolVLM-256M-Instruct"
+    # )
+
+    # To use models on LM Studio with the built-in GGUF or MLX runtimes, e.g. the SmolVLM model:
+    # (uncomment the following lines)
+    pipeline_options.picture_description_options = lms_local_options(
+        model="smolvlm-256m-instruct"
+    )
    #
    # Another possibility is using online services, e.g. watsonx.ai.
    # Using them requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
    # pipeline_options.picture_description_options = watsonx_vlm_options()

    doc_converter = DocumentConverter(
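The hunk ends mid-statement. For orientation, a typical completion in docling's examples looks like the sketch below; the `InputFormat` import, the input file name, and the result iteration are assumptions, not part of this diff:

```python
from docling.datamodel.base_models import InputFormat

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
result = doc_converter.convert("input.pdf")
# The VLM's descriptions are attached to the document's picture items.
for picture in result.document.pictures:
    print(picture.annotations)
```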


@@ -13,6 +13,27 @@ from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
### Example of ApiVlmOptions definitions
#### Using LM Studio
def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
    options = ApiVlmOptions(
        url="http://localhost:1234/v1/chat/completions",  # the default LM Studio endpoint
        params=dict(
            model=model,
        ),
        prompt=prompt,
        timeout=90,
        scale=1.0,
        response_format=format,
    )
    return options
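The `format` argument tells the pipeline how to parse the model output: `ResponseFormat.DOCTAGS` for models that emit docling's DocTags markup (e.g. SmolDocling), `ResponseFormat.MARKDOWN` for general VLMs prompted to OCR to markdown. A sketch of both configurations, with model names assumed to match what is loaded in LM Studio:

```python
# DocTags output from a docling-native model (model name is an assumption):
doctags_options = lms_vlm_options(
    model="smoldocling-256m-preview-mlx-docling-snap",
    prompt="Convert this page to docling.",
    format=ResponseFormat.DOCTAGS,
)

# Markdown output from a general-purpose VLM (model name is an assumption):
markdown_options = lms_vlm_options(
    model="granite-vision-3.2-2b",
    prompt="OCR the full page to markdown.",
    format=ResponseFormat.MARKDOWN,
)
```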
#### Using Ollama
def ollama_vlm_options(model: str, prompt: str):
    options = ApiVlmOptions(
@@ -28,6 +49,9 @@ def ollama_vlm_options(model: str, prompt: str):
    return options
#### Using a cloud service like IBM watsonx.ai
def watsonx_vlm_options(model: str, prompt: str):
    load_dotenv()
    api_key = os.environ.get("WX_API_KEY")
@@ -65,6 +89,9 @@ def watsonx_vlm_options(model: str, prompt: str):
    return options
### Usage and conversion
def main():
    logging.basicConfig(level=logging.INFO)
@@ -78,16 +105,34 @@ def main():
    # The ApiVlmOptions() allows interfacing with APIs supporting
    # the multi-modal chat interface. Here follow a few examples of how to configure them.
-    # One possibility is self-hosting a model, e.g. via Ollama.
-    # Example using the Granite Vision model: (uncomment the following lines)
-    pipeline_options.vlm_options = ollama_vlm_options(
-        model="granite3.2-vision:2b",
-        prompt="OCR the full page to markdown.",
-    )
+    # One possibility is self-hosting a model, e.g. via LM Studio, Ollama or others.
+    # Example using the SmolDocling model with LM Studio:
+    # (uncomment the following lines)
+    pipeline_options.vlm_options = lms_vlm_options(
+        model="smoldocling-256m-preview-mlx-docling-snap",
+        prompt="Convert this page to docling.",
+        format=ResponseFormat.DOCTAGS,
+    )
+
+    # Example using the Granite Vision model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_vlm_options(
+    #     model="granite-vision-3.2-2b",
+    #     prompt="OCR the full page to markdown.",
+    #     format=ResponseFormat.MARKDOWN,
+    # )
+
+    # Example using the Granite Vision model with Ollama:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = ollama_vlm_options(
+    #     model="granite3.2-vision:2b",
+    #     prompt="OCR the full page to markdown.",
+    # )
+
    # Another possibility is using online services, e.g. watsonx.ai.
    # Using them requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
    # pipeline_options.vlm_options = watsonx_vlm_options(
    #     model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
    # )
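The diff ends before the converter setup. A sketch of how these options usually feed a VLM pipeline run; the `InputFormat` import, the `pipeline_cls=VlmPipeline` wiring, and the file name follow docling's example structure and are assumptions, not part of this commit:

```python
from docling.datamodel.base_models import InputFormat

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = doc_converter.convert("input.pdf")
print(result.document.export_to_markdown())
```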