feat: new vlm-models support (#1570)
* feat: adding new vlm-models support
* fixed the transformers
* got microsoft/Phi-4-multimodal-instruct to work
* working on vlm's
* refactoring the VLM part
* all working, now serious refactoring necessary
* refactoring the download_model
* added the formulate_prompt
* pixtral 12b runs via MLX and native transformers
* added the VlmPredictionToken
* refactoring minimal_vlm_pipeline
* fixed the MyPy
* added pipeline_model_specializations file
* need to get Phi4 working again ...
* finalising last points for vlms support
* fixed the pipeline for Phi4
* streamlining all code
* reformatted the code
* fixing the tests
* added the html backend to the VLM pipeline
* fixed the static load_from_doctags
* restore stable imports
* use AutoModelForVision2Seq for Pixtral and review example (including rename)
* remove unused value
* refactor instances of VLM models
* skip compare example in CI
* use lowercase and uppercase only
* add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import
* rename pipeline_vlm_model_spec
* move more arguments to options and simplify model init
* add supported_devices
* remove not-needed function
* exclude minimal_vlm
* missing file
* add message for transformers version
* rename to specs
* use module import and remove MLX from non-darwin
* remove hf_vlm_model and add extra_generation_args
* use single HF VLM model class
* remove torch type
* add docs for vision models

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
commit cfdf4cea25 (parent 08dcacc5cb)
docling/datamodel/accelerator_options.py (new file, 68 lines added)
@@ -0,0 +1,68 @@
import logging
import os
import re
from enum import Enum
from typing import Any, Union

from pydantic import field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

_log = logging.getLogger(__name__)


class AcceleratorDevice(str, Enum):
    """Devices to run model inference"""

    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"


class AcceleratorOptions(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )

    num_threads: int = 4
    device: Union[str, AcceleratorDevice] = "auto"
    cuda_use_flash_attention2: bool = False

    @field_validator("device")
    def validate_device(cls, value):
        # "auto", "cpu", "cuda", "mps", or "cuda:N"
        if value in {d.value for d in AcceleratorDevice} or re.match(
            r"^cuda(:\d+)?$", value
        ):
            return value
        raise ValueError(
            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
        )

    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.

        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the envvar value for that parameter.
        """
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
            # Check whether to set num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
                omp_num_threads = os.getenv("OMP_NUM_THREADS")
                if docling_num_threads is None and omp_num_threads is not None:
                    try:
                        data["num_threads"] = int(omp_num_threads)
                    except ValueError:
                        _log.error(
                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                            omp_num_threads,
                        )
        return data
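For reference, a minimal sketch (not part of the diff) of how these options can be set, either explicitly or through the environment variables handled above; the env_prefix and the OMP_NUM_THREADS fallback follow the code in this file.

```python
# Sketch only: exercising AcceleratorOptions as defined above.
import os

from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions

# Explicit construction; "cuda:N" strings are accepted by validate_device.
opts = AcceleratorOptions(device="cuda:0", num_threads=8)

# Environment-driven construction (pydantic-settings, env_prefix="DOCLING_").
os.environ["DOCLING_DEVICE"] = AcceleratorDevice.MPS.value
os.environ["OMP_NUM_THREADS"] = "16"  # used only when DOCLING_NUM_THREADS is unset
opts = AcceleratorOptions()
print(opts.device, opts.num_threads)  # mps 16
```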
@@ -13,11 +13,11 @@ from docling_core.types.doc import (
    TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell

# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.io import (
    DocumentStream,
)

# DO NOT REMOVE; explicitly exposed from this location
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, computed_field
@@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
    error_message: str


# class Cell(BaseModel):
#     id: int
#     text: str
#     bbox: BoundingBox


class Cluster(BaseModel):
    id: int
    label: DocItemLabel
@@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


class VlmPredictionToken(BaseModel):
    text: str = ""
    token: int = -1
    logprob: float = -1


class VlmPrediction(BaseModel):
    text: str = ""
    generated_tokens: list[VlmPredictionToken] = []
    generation_time: float = -1


class ContainerElement(
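A short sketch of the new prediction containers in use; the docling.datamodel.base_models import path and the literal values are assumptions for illustration, while the field names come from the classes above.

```python
# Sketch only: shape of the new VLM prediction containers.
from docling.datamodel.base_models import VlmPrediction, VlmPredictionToken

pred = VlmPrediction(
    text="<doctag>...</doctag>",
    generated_tokens=[
        VlmPredictionToken(text="<doctag>", token=128, logprob=-0.02),
    ],
    generation_time=1.7,
)
# Per-token log-probabilities can be inspected after generation:
print(sum(t.logprob for t in pred.generated_tokens))
```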
@@ -1,6 +1,4 @@
import logging
import os
import re
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -10,73 +8,28 @@ from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    field_validator,
    model_validator,
)
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import deprecated

# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
)
from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
)

_log = logging.getLogger(__name__)


class AcceleratorDevice(str, Enum):
    """Devices to run model inference"""

    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"


class AcceleratorOptions(BaseSettings):
    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )

    num_threads: int = 4
    device: Union[str, AcceleratorDevice] = "auto"
    cuda_use_flash_attention2: bool = False

    @field_validator("device")
    def validate_device(cls, value):
        # "auto", "cpu", "cuda", "mps", or "cuda:N"
        if value in {d.value for d in AcceleratorDevice} or re.match(
            r"^cuda(:\d+)?$", value
        ):
            return value
        raise ValueError(
            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
        )

    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.

        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the envvar value for that parameter.
        """
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
            # Check whether to set num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
                omp_num_threads = os.getenv("OMP_NUM_THREADS")
                if docling_num_threads is None and omp_num_threads is not None:
                    try:
                        data["num_threads"] = int(omp_num_threads)
                    except ValueError:
                        _log.error(
                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                            omp_num_threads,
                        )
        return data


class BaseOptions(BaseModel):
    """Base class for options."""
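Because the old option names are re-exported above as aliases of the new specs, existing imports keep working. A quick sketch, assuming the module edited in this hunk is docling.datamodel.pipeline_options (the file name is not shown in this view):

```python
# Sketch only: the aliased re-exports keep legacy import paths working.
from docling.datamodel.pipeline_options import (
    granite_vision_vlm_conversion_options,  # alias of GRANITE_VISION_TRANSFORMERS
    smoldocling_vlm_mlx_conversion_options,  # alias of SMOLDOCLING_MLX
)
from docling.datamodel.vlm_model_specs import GRANITE_VISION_TRANSFORMERS

assert granite_vision_vlm_conversion_options is GRANITE_VISION_TRANSFORMERS
```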
@@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
    lang: List[str] = [
        "english",
        "chinese",
    ]  # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
    # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    ]
    # However, language as a parameter is not supported by rapidocr yet
    # and hence changing this options doesn't affect anything.

    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/

    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/

    # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    text_score: float = 0.5  # same default as rapidocr

    use_det: Optional[bool] = None  # same default as rapidocr
    use_cls: Optional[bool] = None  # same default as rapidocr
    use_rec: Optional[bool] = None  # same default as rapidocr

    # class Device(Enum):
    #     CPU = "CPU"
    #     CUDA = "CUDA"
    #     DIRECTML = "DIRECTML"
    #     AUTO = "AUTO"

    # device: Device = Device.AUTO  # Default value is AUTO

    print_verbose: bool = False  # same default as rapidocr

    det_model_path: Optional[str] = None  # same default as rapidocr
@@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
        return self.repo_id.replace("/", "--")


# SmolVLM
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")

# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt="What is shown in this image?",
)


class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str


class ResponseFormat(str, Enum):
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"


class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
    OPENAI = "openai"


class HuggingFaceVlmOptions(BaseVlmOptions):
    kind: Literal["hf_model_options"] = "hf_model_options"

    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    response_format: ResponseFormat

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


class ApiVlmOptions(BaseVlmOptions):
    kind: Literal["api_model_options"] = "api_model_options"

    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat


smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
)


smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
)

granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
)

granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="OCR the full page to markdown.",
    scale=1.0,
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
)


class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"


# Define an enum for the backend options
class PdfBackend(str, Enum):
    """Enum of valid PDF backends."""
@@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
        False  # (To be used with vlms, or other generative models)
    )
    # If True, text from backend will be used instead of generated text
    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
        smoldocling_vlm_conversion_options
    )
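A sketch of how these pipeline options are typically wired into a conversion, assuming docling's DocumentConverter, PdfFormatOption and VlmPipeline classes, which are not part of this diff:

```python
# Sketch only: plugging a VLM spec into the VLM pipeline options.
# DocumentConverter, PdfFormatOption and VlmPipeline are assumed here;
# they are part of docling but are not shown in this diff.
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.SMOLDOCLING_TRANSFORMERS,  # the default spec
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
doc = converter.convert("example.pdf").document
print(doc.export_to_markdown())
```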
docling/datamodel/pipeline_options_vlm_model.py (new file, 81 lines added)
@@ -0,0 +1,81 @@
from enum import Enum
from typing import Any, Dict, List, Literal

from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice


class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str


class ResponseFormat(str, Enum):
    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
    HTML = "html"


class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"


class TransformersModelType(str, Enum):
    AUTOMODEL = "automodel"
    AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
    AUTOMODEL_CAUSALLM = "automodel-causallm"


class InlineVlmOptions(BaseVlmOptions):
    kind: Literal["inline_model_options"] = "inline_model_options"

    repo_id: str
    trust_remote_code: bool = False
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    response_format: ResponseFormat

    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    scale: float = 2.0

    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}

    use_kv_cache: bool = True
    max_new_tokens: int = 4096

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


@deprecated("Use InlineVlmOptions instead.")
class HuggingFaceVlmOptions(InlineVlmOptions):
    pass


class ApiVlmOptions(BaseVlmOptions):
    kind: Literal["api_model_options"] = "api_model_options"

    url: AnyUrl = AnyUrl(
        "http://localhost:11434/v1/chat/completions"
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
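A sketch of a user-defined spec built from these options; the repo id below is a placeholder, not a model shipped with docling:

```python
# Sketch only: a custom inline model spec built from the options above.
# The repo id is a hypothetical placeholder.
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
)

my_vlm = InlineVlmOptions(
    repo_id="my-org/my-vision-model",  # hypothetical HF repo id
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    max_new_tokens=8192,
)
print(my_vlm.repo_cache_folder)  # "my-org--my-vision-model"
```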
docling/datamodel/vlm_model_specs.py (new file, 144 lines added)
@@ -0,0 +1,144 @@
import logging
from enum import Enum

from pydantic import (
    AnyUrl,
)

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersModelType,
)

_log = logging.getLogger(__name__)


# SmolDocling
SMOLDOCLING_MLX = InlineVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ],
    scale=2.0,
    temperature=0.0,
)

# GraniteVision
GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ],
    scale=2.0,
    temperature=0.0,
)

GRANITE_VISION_OLLAMA = ApiVlmOptions(
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    scale=1.0,
    timeout=120,
    response_format=ResponseFormat.MARKDOWN,
    temperature=0.0,
)

# Pixtral
PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
    repo_id="mistral-community/pixtral-12b",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
)

PIXTRAL_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/pixtral-12b-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

# Phi4
PHI4_TRANSFORMERS = InlineVlmOptions(
    repo_id="microsoft/Phi-4-multimodal-instruct",
    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
    trust_remote_code=True,
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
    supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
    scale=2.0,
    temperature=0.0,
    extra_generation_config=dict(num_logits_to_keep=0),
)

# Qwen
QWEN25_VL_3B_MLX = InlineVlmOptions(
    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

# Gemma-3
GEMMA3_12B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-12b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)

GEMMA3_27B_MLX = InlineVlmOptions(
    repo_id="mlx-community/gemma-3-27b-it-bf16",
    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.MLX,
    supported_devices=[AcceleratorDevice.MPS],
    scale=2.0,
    temperature=0.0,
)


class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
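A sketch of selecting one of the predefined specs by platform, in line with the MLX variants declaring supported_devices=[AcceleratorDevice.MPS]:

```python
# Sketch only: picking a predefined spec depending on the platform.
import sys

from docling.datamodel import vlm_model_specs

if sys.platform == "darwin":
    # MLX variants are restricted to Apple silicon (MPS) in their supported_devices.
    spec = vlm_model_specs.SMOLDOCLING_MLX
else:
    spec = vlm_model_specs.SMOLDOCLING_TRANSFORMERS

print(spec.repo_id, spec.response_format)
```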