[Bugfix][Model] Fix Mllama SDPA illegal memory access for batched multi-image (vllm-project#9626)

Signed-off-by: mgoin <[email protected]>
Signed-off-by: NickLucche <[email protected]>
NickLucche · Oct 31, 2024 · 9973825 · 9973825
1 parent 1c48022
commit 9973825
Showing 1 changed file with 5 additions and 3 deletions.
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
@@ -795,17 +795,19 @@ def attention_with_mask(
         kv_len = k.shape[0]
         q = q.transpose(0, 1).view(self.num_local_key_value_heads,
                                    self.num_key_value_groups, q_len,
-                                   self.head_dim)
+                                   self.head_dim).contiguous()
         k = k.transpose(0,
                         1)[:,
                            None, :, :].expand(self.num_local_key_value_heads,
                                               self.num_key_value_groups,
-                                              kv_len, self.head_dim)
+                                              kv_len,
+                                              self.head_dim).contiguous()
         v = v.transpose(0,
                         1)[:,
                            None, :, :].expand(self.num_local_key_value_heads,
                                               self.num_key_value_groups,
-                                              kv_len, self.head_dim)
+                                              kv_len,
+                                              self.head_dim).contiguous()
         attention_mask = attention_mask.view(1, 1, q_len, kv_len)
         output = F.scaled_dot_product_attention(q,
                                                 k,