Separate model runner

vllm-project · Apr 3, 2024 · 24a5a18 · 24a5a18
1 parent 77a6572
commit 24a5a18
Show file tree

Hide file tree

Showing 4 changed files with 409 additions and 16 deletions.
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
@@ -141,7 +141,7 @@ def forward(
                                                 attn_metadata.kv_cache_dtype)
 
         if attn_metadata.is_prompt:
-            if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
+            if (kv_cache is None or attn_metadata.block_tables is None):
                 if self.num_kv_heads != self.num_heads:
                     key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
                     value = value.repeat_interleave(self.num_queries_per_kv,
@@ -221,8 +221,8 @@ def _make_alibi_bias(
         bias = bias[None, :] - bias[:, None]
 
         num_heads = alibi_slopes.shape[0]
-        bias = bias[None, :].expand(num_heads, prompt_len, prompt_len)
-        bias.mul_(alibi_slopes[:, None, None])
+        bias = bias[None, :].expand(num_heads, prompt_len, prompt_len)\
+                .mul(alibi_slopes[:, None, None])
         inf_mask = torch.empty(
             (1, prompt_len, prompt_len),
             dtype=bias.dtype).fill_(-torch.inf).triu_(diagonal=1)

diff --git a/vllm/utils.py b/vllm/utils.py
@@ -370,7 +370,6 @@ def is_pin_memory_available() -> bool:
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
     elif is_cpu():
-        print_warning_once("Pin memory is not supported on CPU.")
         return False
     return True