diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py
index 8e105d2..938d0a5 100644
--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@@ -13,6 +13,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 
+### Example of PictureDescriptionApiOptions definitions
+
+#### Using vLLM
+# Models can be launched via:
+# $ vllm serve MODEL_NAME
+
 
 def vllm_local_options(model: str):
     options = PictureDescriptionApiOptions(
@@ -28,6 +34,26 @@ def vllm_local_options(model: str):
     return options
 
 
+#### Using LM Studio
+
+
+def lms_local_options(model: str):
+    options = PictureDescriptionApiOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+            seed=42,
+            max_completion_tokens=200,
+        ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+        timeout=90,
+    )
+    return options
+
+
+#### Using a cloud service like IBM watsonx.ai
+
+
 def watsonx_vlm_options():
     load_dotenv()
     api_key = os.environ.get("WX_API_KEY")
@@ -49,7 +75,7 @@ def watsonx_vlm_options():
     options = PictureDescriptionApiOptions(
         url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
         params=dict(
-            model_id="meta-llama/llama-3-2-11b-vision-instruct",
+            model_id="ibm/granite-vision-3-2-2b",
             project_id=project_id,
             parameters=dict(
                 max_new_tokens=400,
@@ -64,6 +90,9 @@ def watsonx_vlm_options():
     return options
 
 
+### Usage and conversion
+
+
 def main():
     logging.basicConfig(level=logging.INFO)
 
@@ -80,20 +109,28 @@ def main():
     # One possibility is self-hosting model, e.g. via VLLM.
     # $ vllm serve MODEL_NAME
     # Then PictureDescriptionApiOptions can point to the localhost endpoint.
-    #
-    # Example for the Granite Vision model: (uncomment the following lines)
+
+    # Example for the Granite Vision model:
+    # (uncomment the following lines)
     # pipeline_options.picture_description_options = vllm_local_options(
     #     model="ibm-granite/granite-vision-3.1-2b-preview"
     # )
-    #
-    # Example for the SmolVLM model: (uncomment the following lines)
-    pipeline_options.picture_description_options = vllm_local_options(
-        model="HuggingFaceTB/SmolVLM-256M-Instruct"
+
+    # Example for the SmolVLM model:
+    # (uncomment the following lines)
+    # pipeline_options.picture_description_options = vllm_local_options(
+    #     model="HuggingFaceTB/SmolVLM-256M-Instruct"
+    # )
+
+    # For using models hosted on LM Studio via the built-in GGUF or MLX runtimes, e.g. the SmolVLM model:
+    # (uncomment the following lines)
+    pipeline_options.picture_description_options = lms_local_options(
+        model="smolvlm-256m-instruct"
     )
-    #
+
     # Another possibility is using online services, e.g. watsonx.ai.
     # Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
     # pipeline_options.picture_description_options = watsonx_vlm_options()
 
     doc_converter = DocumentConverter(
diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py
index ec29e21..20ca259 100644
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -13,6 +13,27 @@ from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, Response
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
+### Example of ApiVlmOptions definitions
+
+#### Using LM Studio
+
+
+def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",  # the default LM Studio endpoint
+        params=dict(
+            model=model,
+        ),
+        prompt=prompt,
+        timeout=90,
+        scale=1.0,
+        response_format=format,
+    )
+    return options
+
+
+#### Using Ollama
+
 
 def ollama_vlm_options(model: str, prompt: str):
     options = ApiVlmOptions(
@@ -28,6 +49,9 @@ def ollama_vlm_options(model: str, prompt: str):
     return options
 
 
+#### Using a cloud service like IBM watsonx.ai
+
+
 def watsonx_vlm_options(model: str, prompt: str):
     load_dotenv()
     api_key = os.environ.get("WX_API_KEY")
@@ -65,6 +89,9 @@ def watsonx_vlm_options(model: str, prompt: str):
     return options
 
 
+### Usage and conversion
+
+
 def main():
     logging.basicConfig(level=logging.INFO)
 
@@ -78,16 +105,34 @@ def main():
     # The ApiVlmOptions() allows to interface with APIs supporting
     # the multi-modal chat interface. Here follow a few example on how to configure those.
 
-    # One possibility is self-hosting model, e.g. via Ollama.
-    # Example using the Granite Vision model: (uncomment the following lines)
-    pipeline_options.vlm_options = ollama_vlm_options(
-        model="granite3.2-vision:2b",
-        prompt="OCR the full page to markdown.",
+    # One possibility is self-hosting a model, e.g. via LM Studio, Ollama, or others.
+
+    # Example using the SmolDocling model with LM Studio:
+    # (uncomment the following lines)
+    pipeline_options.vlm_options = lms_vlm_options(
+        model="smoldocling-256m-preview-mlx-docling-snap",
+        prompt="Convert this page to docling.",
+        format=ResponseFormat.DOCTAGS,
     )
 
+    # Example using the Granite Vision model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_vlm_options(
+    #     model="granite-vision-3.2-2b",
+    #     prompt="OCR the full page to markdown.",
+    #     format=ResponseFormat.MARKDOWN,
+    # )
+
+    # Example using the Granite Vision model with Ollama:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = ollama_vlm_options(
+    #     model="granite3.2-vision:2b",
+    #     prompt="OCR the full page to markdown.",
+    # )
+
     # Another possibility is using online services, e.g. watsonx.ai.
     # Using requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
-    # Uncomment the following line for this option:
+    # (uncomment the following lines)
     # pipeline_options.vlm_options = watsonx_vlm_options(
     #     model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
     # )
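For reference, and not part of the patch itself: a minimal sketch of how the new lms_local_options() helper typically plugs into a conversion, assuming an LM Studio server is listening on localhost:1234 with the smolvlm-256m-instruct model loaded; the input path is illustrative. The complete flow is in the main() functions of the two example scripts above.

    import logging

    from docling_core.types.doc import PictureItem

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    logging.basicConfig(level=logging.INFO)

    # Enable picture description and allow calls to a remote, API-based service.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True
    pipeline_options.enable_remote_services = True  # required for API-backed options
    # lms_local_options() as defined in docs/examples/pictures_description_api.py above
    pipeline_options.picture_description_options = lms_local_options(
        model="smolvlm-256m-instruct"
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    result = doc_converter.convert("input.pdf")  # illustrative input path

    # Print the generated description(s) for each picture in the document.
    for element, _level in result.document.iterate_items():
        if isinstance(element, PictureItem):
            print(f"{element.self_ref}: {element.annotations}")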