docs: update vlm models api examples with LM Studio (#1759)

update vlm models api examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi, 2025-06-12 05:58:44 -05:00, committed by GitHub
parent 7a275c7637
commit 0432a31b2f
2 changed files with 97 additions and 15 deletions

File 1: picture description API example

@@ -13,6 +13,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 
+### Example of PictureDescriptionApiOptions definitions
+
+#### Using vLLM
+# Models can be launched via:
+# $ vllm serve MODEL_NAME
+
 def vllm_local_options(model: str):
     options = PictureDescriptionApiOptions(
@@ -28,6 +34,26 @@ def vllm_local_options(model: str):
     return options
 
 
+#### Using LM Studio
+
+
+def lms_local_options(model: str):
+    options = PictureDescriptionApiOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+            seed=42,
+            max_completion_tokens=200,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+    return options
+
+
+#### Using a cloud service like IBM watsonx.ai
+
 def watsonx_vlm_options():
     load_dotenv()
     api_key = os.environ.get("WX_API_KEY")
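For context: lms_local_options points at LM Studio's OpenAI-compatible server, which listens on port 1234 by default once the local server is started in the app (or, assuming the lms CLI is installed, via lms server start). A minimal sketch to verify the endpoint before running the pipeline; /v1/models is part of the OpenAI-compatible API surface:

    # Connectivity check (assumes LM Studio's local server is running on the default port).
    import requests

    resp = requests.get("http://localhost:1234/v1/models", timeout=5)
    resp.raise_for_status()
    # Model names listed here are the values usable in lms_local_options().
    print([m["id"] for m in resp.json()["data"]])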
@@ -49,7 +75,7 @@ def watsonx_vlm_options():
     options = PictureDescriptionApiOptions(
         url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
         params=dict(
-            model_id="meta-llama/llama-3-2-11b-vision-instruct",
+            model_id="ibm/granite-vision-3-2-2b",
             project_id=project_id,
             parameters=dict(
                 max_new_tokens=400,
@@ -64,6 +90,9 @@ def watsonx_vlm_options():
     return options
 
 
+### Usage and conversion
+
+
 def main():
     logging.basicConfig(level=logging.INFO)
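A note on the watsonx.ai option: watsonx_vlm_options() calls load_dotenv(), so the credentials can live in a .env file next to the script. A sketch of providing them programmatically instead; the values shown are placeholders, not real credentials:

    # Set the environment variables expected by watsonx_vlm_options() before calling it.
    import os

    os.environ["WX_API_KEY"] = "<your-watsonx-api-key>"
    os.environ["WX_PROJECT_ID"] = "<your-watsonx-project-id>"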
@@ -80,20 +109,28 @@ def main():
     # One possibility is self-hosting a model, e.g. via VLLM.
     # $ vllm serve MODEL_NAME
     # Then PictureDescriptionApiOptions can point to the localhost endpoint.
-    #
-    # Example for the Granite Vision model: (uncomment the following lines)
+
+    # Example for the Granite Vision model:
+    # (uncomment the following lines)
     # pipeline_options.picture_description_options = vllm_local_options(
     #     model="ibm-granite/granite-vision-3.1-2b-preview"
     # )
-    #
-    # Example for the SmolVLM model: (uncomment the following lines)
-    pipeline_options.picture_description_options = vllm_local_options(
-        model="HuggingFaceTB/SmolVLM-256M-Instruct"
+
+    # Example for the SmolVLM model:
+    # (uncomment the following lines)
+    # pipeline_options.picture_description_options = vllm_local_options(
+    #     model="HuggingFaceTB/SmolVLM-256M-Instruct"
+    # )
+
+    # For models on LM Studio using the built-in GGUF or MLX runtimes, e.g. the SmolVLM model:
+    # (uncomment the following lines)
+    pipeline_options.picture_description_options = lms_local_options(
+        model="smolvlm-256m-instruct"
     )
-    #
+
     # Another possibility is using online services, e.g. watsonx.ai.
     # Using them requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
     # pipeline_options.picture_description_options = watsonx_vlm_options()
 
     doc_converter = DocumentConverter(
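For orientation, here is a minimal end-to-end sketch of how the options above plug into a conversion. It assumes LM Studio is serving smolvlm-256m-instruct locally; the input file name is illustrative:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True,  # required when the model runs behind an API
        do_picture_description=True,
    )
    pipeline_options.picture_description_options = lms_local_options(
        model="smolvlm-256m-instruct"  # name as reported by LM Studio (illustrative)
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    doc = doc_converter.convert("test.pdf").document  # hypothetical input file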

File 2: VLM pipeline API example

@@ -13,6 +13,27 @@ from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
+### Example of ApiVlmOptions definitions
+
+#### Using LM Studio
+
+
+def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",  # the default LM Studio endpoint
+        params=dict(
+            model=model,
+        ),
+        prompt=prompt,
+        timeout=90,
+        scale=1.0,
+        response_format=format,
+    )
+    return options
+
+
+#### Using Ollama
+
 def ollama_vlm_options(model: str, prompt: str):
     options = ApiVlmOptions(
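As with LM Studio, the Ollama variant assumes a local server. A small sketch to confirm Ollama is up and see which models are pulled, assuming the default port 11434 (/api/tags is Ollama's model-listing endpoint):

    import requests

    # Lists locally available Ollama models; their names go into ollama_vlm_options().
    resp = requests.get("http://localhost:11434/api/tags", timeout=5)
    resp.raise_for_status()
    print([m["name"] for m in resp.json()["models"]])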
@@ -28,6 +49,9 @@ def ollama_vlm_options(model: str, prompt: str):
     return options
 
 
+#### Using a cloud service like IBM watsonx.ai
+
+
 def watsonx_vlm_options(model: str, prompt: str):
     load_dotenv()
     api_key = os.environ.get("WX_API_KEY")
@@ -65,6 +89,9 @@ def watsonx_vlm_options(model: str, prompt: str):
     return options
 
 
+### Usage and conversion
+
+
 def main():
     logging.basicConfig(level=logging.INFO)
@@ -78,16 +105,34 @@ def main():
     # The ApiVlmOptions() allows interfacing with APIs supporting
     # the multi-modal chat interface. Here follow a few examples of how to configure those.
 
-    # One possibility is self-hosting a model, e.g. via Ollama.
-    # Example using the Granite Vision model: (uncomment the following lines)
-    pipeline_options.vlm_options = ollama_vlm_options(
-        model="granite3.2-vision:2b",
-        prompt="OCR the full page to markdown.",
+    # One possibility is self-hosting a model, e.g. via LM Studio, Ollama or others.
+
+    # Example using the SmolDocling model with LM Studio:
+    # (uncomment the following lines)
+    pipeline_options.vlm_options = lms_vlm_options(
+        model="smoldocling-256m-preview-mlx-docling-snap",
+        prompt="Convert this page to docling.",
+        format=ResponseFormat.DOCTAGS,
     )
 
+    # Example using the Granite Vision model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_vlm_options(
+    #     model="granite-vision-3.2-2b",
+    #     prompt="OCR the full page to markdown.",
+    #     format=ResponseFormat.MARKDOWN,
+    # )
+
+    # Example using the Granite Vision model with Ollama:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = ollama_vlm_options(
+    #     model="granite3.2-vision:2b",
+    #     prompt="OCR the full page to markdown.",
+    # )
+
     # Another possibility is using online services, e.g. watsonx.ai.
     # Using them requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
     # pipeline_options.vlm_options = watsonx_vlm_options(
     #     model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
     # )
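
To round this off, a minimal sketch of how these options feed the VLM pipeline, assuming the SmolDocling model is loaded in LM Studio; the input file name is illustrative:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.pipeline_options_vlm_model import ResponseFormat
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # needed because the model sits behind an API
    )
    pipeline_options.vlm_options = lms_vlm_options(
        model="smoldocling-256m-preview-mlx-docling-snap",
        prompt="Convert this page to docling.",
        format=ResponseFormat.DOCTAGS,
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,  # route PDFs through the VLM pipeline
                pipeline_options=pipeline_options,
            )
        }
    )
    doc = doc_converter.convert("test.pdf").document  # hypothetical input file
    print(doc.export_to_markdown())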