
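"""Picture description model calling an OpenAI-compatible chat-completions API.

Each picture is re-encoded as a base64 PNG data URL and sent, together with the
configured prompt, to the endpoint given in PictureDescriptionApiOptions; the
generated caption text is yielded per image.
"""
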
import base64
import io
import logging
from typing import Iterable, List, Optional

import httpx
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
    PictureDescriptionData,
)
from PIL import Image
from pydantic import BaseModel, ConfigDict

from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel

_log = logging.getLogger(__name__)


class ChatMessage(BaseModel):
    role: str
    content: str


class ResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: str


class ResponseUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ApiResponse(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    id: str
    model: Optional[str] = None  # returned by openai
    choices: List[ResponseChoice]
    created: int
    usage: ResponseUsage

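
# Editorial sketch (not part of the original module): the models above mirror
# the OpenAI chat-completions response shape, so a payload like this
# hypothetical JSON parses directly:
#
#     _sample = (
#         '{"id": "chatcmpl-0", "created": 1700000000,'
#         ' "choices": [{"index": 0, "finish_reason": "stop",'
#         ' "message": {"role": "assistant", "content": "A bar chart."}}],'
#         ' "usage": {"prompt_tokens": 9, "completion_tokens": 4,'
#         ' "total_tokens": 13}}'
#     )
#     ApiResponse.model_validate_json(_sample).choices[0].message.content
#     # -> "A bar chart."
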
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
    # elements_batch_size = 4

    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
        super().__init__(enabled=enabled, options=options)
        self.options: PictureDescriptionApiOptions

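        # Remote APIs are not yet allowed: only URLs whose host is
        # "localhost" pass the check below, everything else raises.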
        if self.enabled:
            if options.url.host != "localhost":
                raise NotImplementedError(
                    "The options try to connect to remote APIs which are not yet allowed."
                )

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
            img_io = io.BytesIO()
            image.save(img_io, "PNG")
            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")

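            # Build an OpenAI chat-completions "vision" message: one text part
            # with the configured prompt, one image part with the PNG embedded
            # as a base64 data URL.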
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.options.prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ],
                }
            ]
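
            # self.options.params is merged verbatim into the request body; in
            # an OpenAI-compatible deployment it would typically carry fields
            # such as "model" or "max_tokens" (illustrative examples, not
            # mandated by this code).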
            payload = {
                "messages": messages,
                **self.options.params,
            }

            r = httpx.post(
                str(self.options.url),
                headers=self.options.headers,
                json=payload,
                timeout=self.options.timeout,
            )
            if not r.is_success:
                _log.error(f"Error calling the API. Response was {r.text}")
            r.raise_for_status()

            api_resp = ApiResponse.model_validate_json(r.text)
            generated_text = api_resp.choices[0].message.content.strip()
            yield generated_text
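
# Example usage (editorial sketch): wiring the model up by hand against a local
# OpenAI-compatible server. URL, prompt, and params values are assumptions for
# illustration; in a docling pipeline these options are normally supplied via
# the pipeline options rather than constructed directly.
#
#     from PIL import Image
#
#     options = PictureDescriptionApiOptions(
#         url="http://localhost:8000/v1/chat/completions",
#         prompt="Describe this image in a few sentences.",
#         params={"model": "HuggingFaceTB/SmolVLM-256M-Instruct"},
#         timeout=60,
#     )
#     model = PictureDescriptionApiModel(enabled=True, options=options)
#     captions = list(model._annotate_images([Image.new("RGB", (64, 64))]))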