From 2d66e99b69f39a282109c366fae3679f41c6e081 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Thu, 13 Feb 2025 08:33:12 +0100 Subject: [PATCH] docs: Examples for picture descriptions (#951) * add more examples for picture descriptions Signed-off-by: Michele Dolfi * fix merge typo Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docs/examples/pictures_description_api.py | 91 +++++++++++++++++++---- mkdocs.yml | 2 + 2 files changed, 78 insertions(+), 15 deletions(-) diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py index 05689c5..8e105d2 100644 --- a/docs/examples/pictures_description_api.py +++ b/docs/examples/pictures_description_api.py @@ -1,7 +1,10 @@ import logging +import os from pathlib import Path +import requests from docling_core.types.doc import PictureItem +from dotenv import load_dotenv from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( @@ -11,29 +14,87 @@ from docling.datamodel.pipeline_options import ( from docling.document_converter import DocumentConverter, PdfFormatOption -def main(): - logging.basicConfig(level=logging.INFO) - - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") - - # This is using a local API server to do picture description. - # For example, you can launch it locally with: - # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct" - - pipeline_options = PdfPipelineOptions( - enable_remote_services=True # <-- this is required! - ) - pipeline_options.do_picture_description = True - pipeline_options.picture_description_options = PictureDescriptionApiOptions( +def vllm_local_options(model: str): + options = PictureDescriptionApiOptions( url="http://localhost:8000/v1/chat/completions", params=dict( - model="HuggingFaceTB/SmolVLM-256M-Instruct", + model=model, seed=42, max_completion_tokens=200, ), prompt="Describe the image in three sentences. 
Be consise and accurate.", timeout=90, ) + return options + + +def watsonx_vlm_options(): + load_dotenv() + api_key = os.environ.get("WX_API_KEY") + project_id = os.environ.get("WX_PROJECT_ID") + + def _get_iam_access_token(api_key: str) -> str: + res = requests.post( + url="https://iam.cloud.ibm.com/identity/token", + headers={ + "Content-Type": "application/x-www-form-urlencoded", + }, + data=f"grant_type=urn:ibm:params:oauth:grant-type:apikey&apikey={api_key}", + ) + res.raise_for_status() + api_out = res.json() + print(f"{api_out=}") + return api_out["access_token"] + + options = PictureDescriptionApiOptions( + url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29", + params=dict( + model_id="meta-llama/llama-3-2-11b-vision-instruct", + project_id=project_id, + parameters=dict( + max_new_tokens=400, + ), + ), + headers={ + "Authorization": "Bearer " + _get_iam_access_token(api_key=api_key), + }, + prompt="Describe the image in three sentences. Be consise and accurate.", + timeout=60, + ) + return options + + +def main(): + logging.basicConfig(level=logging.INFO) + + input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + + pipeline_options = PdfPipelineOptions( + enable_remote_services=True # <-- this is required! + ) + pipeline_options.do_picture_description = True + + # The PictureDescriptionApiOptions() allows to interface with APIs supporting + # the multi-modal chat interface. Here follow a few example on how to configure those. + # + # One possibility is self-hosting model, e.g. via VLLM. + # $ vllm serve MODEL_NAME + # Then PictureDescriptionApiOptions can point to the localhost endpoint. 
+ # + # Example for the Granite Vision model: (uncomment the following lines) + # pipeline_options.picture_description_options = vllm_local_options( + # model="ibm-granite/granite-vision-3.1-2b-preview" + # ) + # + # Example for the SmolVLM model: (uncomment the following lines) + pipeline_options.picture_description_options = vllm_local_options( + model="HuggingFaceTB/SmolVLM-256M-Instruct" + ) + # + # Another possibility is using online services, e.g. watsonx.ai. + # Using it requires setting the env variables WX_API_KEY and WX_PROJECT_ID. + # Uncomment the following line for this option: + # pipeline_options.picture_description_options = watsonx_vlm_options() doc_converter = DocumentConverter( format_options={ diff --git a/mkdocs.yml b/mkdocs.yml index abb93a2..46de22a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,6 +75,8 @@ nav: - "Figure enrichment": examples/develop_picture_enrichment.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py + - "Annotate picture with local vlm": examples/pictures_description.py + - "Annotate picture with remote vlm": examples/pictures_description_api.py - "Force full page OCR": examples/full_page_ocr.py - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py