fix: enable cuda_use_flash_attention2 for PictureDescriptionVlmModel (#1496)

fix: enable cuda_use_flash_attention2 for PictureDescriptionVlmModel

Signed-off-by: Zach Cox <zach.s.cox@gmail.com>

@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                 ),
             ).to(self.device)
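
With this change, Flash Attention 2 is only requested when the pipeline runs on CUDA and the accelerator options opt in; otherwise the model loads with the "eager" attention implementation. Below is a minimal sketch of opting in from the pipeline configuration, assuming docling's AcceleratorOptions, PdfPipelineOptions, and the smolvlm_picture_description preset keep the names shown here; the input file name is hypothetical.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    smolvlm_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = smolvlm_picture_description
# Opt in to Flash Attention 2; without cuda_use_flash_attention2=True the
# picture description model now falls back to the "eager" implementation
# even when running on CUDA.
pipeline_options.accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.CUDA,
    cuda_use_flash_attention2=True,
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("report.pdf")  # hypothetical input file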