Docling/docling/models/picture_description_api_model.py
Michele Dolfi 4cc6e3ea5e
feat: Describe pictures using vision models (#259)
* draft for picture description models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* vlm description using AutoModelForVision2Seq

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add generation options

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* allow only localhost traffic

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* do not run with vlm api

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more renaming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix examples path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply CLI download login

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix name of cli argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use with_smolvlm in models download

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-02-07 16:30:42 +01:00

106 lines
3.1 KiB
Python

import base64
import io
import logging
from typing import Iterable, List, Optional
import httpx
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
PictureDescriptionData,
)
from PIL import Image
from pydantic import BaseModel, ConfigDict
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
_log = logging.getLogger(__name__)
class ChatMessage(BaseModel):
    """A chat message carried inside one choice of the API response."""

    # Role label attached to the message (requests in this module send "user").
    role: str
    # Message text; for a response this is the generated description.
    content: str
class ResponseChoice(BaseModel):
    """One element of the ``choices`` list in the API response."""

    # Position of this choice within the response.
    index: int
    # The message produced for this choice.
    message: ChatMessage
    # Reason string reported by the API for ending the generation.
    finish_reason: str
class ResponseUsage(BaseModel):
    """Token accounting block (``usage``) of the API response."""

    # Token counts as reported by the API; all three are required integers.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
class ApiResponse(BaseModel):
    """Top-level body of the chat-completion response parsed by this module.

    ``id``, ``choices``, ``created`` and ``usage`` are required; ``model`` is
    optional and defaults to ``None`` when the API does not send it.
    """

    # Clear pydantic's protected-namespace list for this model.
    model_config = ConfigDict(protected_namespaces=())

    id: str
    model: Optional[str] = None  # returned by openai
    choices: List[ResponseChoice]
    created: int
    usage: ResponseUsage
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    """Describe pictures by calling a chat-completion style HTTP API.

    Every image is PNG-encoded, embedded as a base64 ``data:`` URL next to the
    configured prompt, and POSTed to ``options.url``. The stripped text of the
    first returned choice is used as the description of that image.
    """

    # elements_batch_size = 4

    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
        """Set up the model and reject non-local endpoints.

        Args:
            enabled: Whether this model is active; forwarded to the base class.
            options: API options (url, headers, params, timeout, prompt).

        Raises:
            NotImplementedError: If the model is enabled and the host of
                ``options.url`` is anything other than ``"localhost"``.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: PictureDescriptionApiOptions

        # Only local endpoints are allowed; the check is skipped when disabled.
        if self.enabled and options.url.host != "localhost":
            raise NotImplementedError(
                "The options try to connect to remote APIs which are not yet allowed."
            )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """Yield one generated description per input image, in input order.

        Args:
            images: The images to describe.

        Yields:
            The stripped message content of the first choice in each response.

        Raises:
            httpx.HTTPStatusError: Raised by ``raise_for_status()`` when the
                API answers with an error status (the response body is logged
                first).
        """
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            # Serialize the image to PNG in memory and base64-encode it so it
            # can be inlined in the JSON payload as a data URL.
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ],
                }
            ]

            # User-supplied params are merged last, so a "messages" key in
            # them would override the one built above.
            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = httpx.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.is_success:
                # Log the body before raising so the API's own error text is
                # not lost; lazy %-formatting avoids building the string when
                # the error level is disabled.
                _log.error("Error calling the API. Response was %s", r.text)
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.choices[0].message.content.strip()
            yield generated_text