feat: Maximum image size for Vlm models (#1802)
* Image scale moved to base vlm options. Added max_size image limit (options and vlm models).
* DCO Remediation Commit for Shkarupa Alex <shkarupa.alex@gmail.com>

  I, Shkarupa Alex <shkarupa.alex@gmail.com>, hereby add my Signed-off-by to this commit: e93602a0d02fdb6f6dea1f65686cffcc4c616011

  Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>

---------

Signed-off-by: Shkarupa Alex <shkarupa.alex@gmail.com>
This commit is contained in:
parent
dbab30e92c
commit
215b540f6c
@ -253,11 +253,18 @@ class Page(BaseModel):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def get_image(
|
def get_image(
|
||||||
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
self,
|
||||||
|
scale: float = 1.0,
|
||||||
|
max_size: Optional[int] = None,
|
||||||
|
cropbox: Optional[BoundingBox] = None,
|
||||||
) -> Optional[Image]:
|
) -> Optional[Image]:
|
||||||
if self._backend is None:
|
if self._backend is None:
|
||||||
return self._image_cache.get(scale, None)
|
return self._image_cache.get(scale, None)
|
||||||
|
|
||||||
|
if max_size:
|
||||||
|
assert self.size is not None
|
||||||
|
scale = min(scale, max_size / max(self.size.as_tuple()))
|
||||||
|
|
||||||
if scale not in self._image_cache:
|
if scale not in self._image_cache:
|
||||||
if cropbox is None:
|
if cropbox is None:
|
||||||
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
||||||
|
@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
|
|||||||
class BaseVlmOptions(BaseModel):
|
class BaseVlmOptions(BaseModel):
|
||||||
kind: str
|
kind: str
|
||||||
prompt: str
|
prompt: str
|
||||||
|
scale: float = 2.0
|
||||||
|
max_size: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
class ResponseFormat(str, Enum):
|
class ResponseFormat(str, Enum):
|
||||||
@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
|
|||||||
AcceleratorDevice.MPS,
|
AcceleratorDevice.MPS,
|
||||||
]
|
]
|
||||||
|
|
||||||
scale: float = 2.0
|
|
||||||
|
|
||||||
temperature: float = 0.0
|
temperature: float = 0.0
|
||||||
stop_strings: List[str] = []
|
stop_strings: List[str] = []
|
||||||
extra_generation_config: Dict[str, Any] = {}
|
extra_generation_config: Dict[str, Any] = {}
|
||||||
@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
|
|||||||
) # Default to ollama
|
) # Default to ollama
|
||||||
headers: Dict[str, str] = {}
|
headers: Dict[str, str] = {}
|
||||||
params: Dict[str, Any] = {}
|
params: Dict[str, Any] = {}
|
||||||
scale: float = 2.0
|
|
||||||
timeout: float = 60
|
timeout: float = 60
|
||||||
concurrency: int = 1
|
concurrency: int = 1
|
||||||
response_format: ResponseFormat
|
response_format: ResponseFormat
|
||||||
|
@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
|
|||||||
with TimeRecorder(conv_res, "vlm"):
|
with TimeRecorder(conv_res, "vlm"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
hi_res_image = page.get_image(
|
||||||
|
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||||
|
)
|
||||||
assert hi_res_image is not None
|
assert hi_res_image is not None
|
||||||
if hi_res_image:
|
if hi_res_image:
|
||||||
if hi_res_image.mode != "RGB":
|
if hi_res_image.mode != "RGB":
|
||||||
|
@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|||||||
with TimeRecorder(conv_res, "vlm"):
|
with TimeRecorder(conv_res, "vlm"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
hi_res_image = page.get_image(
|
||||||
|
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||||
|
)
|
||||||
|
|
||||||
# Define prompt structure
|
# Define prompt structure
|
||||||
prompt = self.formulate_prompt()
|
prompt = self.formulate_prompt()
|
||||||
|
@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|||||||
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
hi_res_image = page.get_image(
|
||||||
|
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
||||||
|
)
|
||||||
if hi_res_image is not None:
|
if hi_res_image is not None:
|
||||||
im_width, im_height = hi_res_image.size
|
im_width, im_height = hi_res_image.size
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user