From 215b540f6c078a72464310ef22975ebb6cde4f0a Mon Sep 17 00:00:00 2001
From: Shkarupa Alex
Date: Wed, 18 Jun 2025 13:57:37 +0300
Subject: [PATCH] feat: Maximum image size for Vlm models (#1802)

* Image scale moved to base vlm options. Added max_size image limit (options and vlm models).

* DCO Remediation Commit for Shkarupa Alex

I, Shkarupa Alex, hereby add my Signed-off-by to this commit: e93602a0d02fdb6f6dea1f65686cffcc4c616011

Signed-off-by: Shkarupa Alex

---------

Signed-off-by: Shkarupa Alex
---
 docling/datamodel/base_models.py                           | 9 ++++++++-
 docling/datamodel/pipeline_options_vlm_model.py            | 5 ++---
 docling/models/api_vlm_model.py                            | 4 +++-
 docling/models/vlm_models_inline/hf_transformers_model.py  | 4 +++-
 docling/models/vlm_models_inline/mlx_model.py              | 4 +++-
 5 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 4180090..c1fdb03 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -253,11 +253,18 @@ class Page(BaseModel):
         return []
 
     def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
     ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
 
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
+
         if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 2289c3c..90ab668 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
     prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None
 
 
 class ResponseFormat(str, Enum):
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    scale: float = 2.0
-
     temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
     )  # Default to ollama
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
-    scale: float = 2.0
     timeout: float = 60
     concurrency: int = 1
     response_format: ResponseFormat
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
index 30bc43e..bfd0000 100644
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                     assert hi_res_image is not None
                     if hi_res_image:
                         if hi_res_image.mode != "RGB":
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index 00fdfa5..bd35888 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
 
                     # Define prompt structure
                     prompt = self.formulate_prompt()
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index d8b9040..58f037f 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
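Reviewer note: the snippet below is a minimal, standalone sketch of the clamping rule this patch adds to Page.get_image, useful for checking the effect of max_size in isolation. Only the min(scale, max_size / max(...)) expression comes from the patch; the helper name effective_scale and the sample page dimensions are hypothetical.

# Minimal sketch (not part of the patch): reproduces the scale clamping
# added to Page.get_image. `effective_scale` and the sample page size
# below are hypothetical illustrations, not docling API.
from typing import Optional


def effective_scale(
    page_width: float,
    page_height: float,
    scale: float = 1.0,
    max_size: Optional[int] = None,
) -> float:
    # Same rule as the new `if max_size:` branch: never let the longer
    # page edge exceed max_size pixels after rendering.
    if max_size:
        scale = min(scale, max_size / max(page_width, page_height))
    return scale


if __name__ == "__main__":
    # A 612x792 pt page at scale=2.0 would render at 1224x1584 px;
    # with max_size=1024 the scale is clamped to 1024/792 ~= 1.29.
    print(effective_scale(612, 792, scale=2.0, max_size=1024))
    # Without max_size the requested scale is used unchanged.
    print(effective_scale(612, 792, scale=2.0))

Clamping against the longer edge keeps the aspect ratio and bounds the rendered bitmap at max_size pixels on its longest side, which is the image the VLM models receive downstream.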