From cc453961a9196c79f6428305b9007402e448f300 Mon Sep 17 00:00:00 2001
From: Zach Cox
Date: Wed, 30 Apr 2025 02:02:52 -0400
Subject: [PATCH] fix: enable cuda_use_flash_attention2 for
 PictureDescriptionVlmModel (#1496)

fix: enable use_cuda_flash_attention2 for PictureDescriptionVlmModel

Signed-off-by: Zach Cox
---
 docling/models/picture_description_vlm_model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 374f575..679e80c 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                 ),
             ).to(self.device)
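
For context, the sketch below shows how a caller might opt into the behavior this patch gates. It is an illustrative sketch only: the flag name cuda_use_flash_attention2 comes from the diff above, while the import paths and surrounding objects (PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice, DocumentConverter, PdfFormatOption, InputFormat) are assumed from docling's public pipeline-options API, and "example.pdf" is a hypothetical input.

    # Sketch only, not part of this patch. Assumes docling's pipeline-options API;
    # only cuda_use_flash_attention2 is confirmed by the diff above.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        AcceleratorDevice,
        AcceleratorOptions,
        PdfPipelineOptions,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_picture_description = True  # enables PictureDescriptionVlmModel
    pipeline_options.accelerator_options = AcceleratorOptions(
        device=AcceleratorDevice.CUDA,
        # With this patch, the VLM is loaded with _attn_implementation="flash_attention_2"
        # only when running on CUDA *and* this flag is set; otherwise it falls back to "eager".
        cuda_use_flash_attention2=True,
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
    result = converter.convert("example.pdf")  # hypothetical input file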