From 7cf85f3a64af794b58f6ccbe795c05a419d734f1 Mon Sep 17 00:00:00 2001
From: Aman Gupta Karmani
Date: Wed, 30 Aug 2023 00:52:13 -0400
Subject: [PATCH] use flash-attn via xformers (#877)

---
 tests/kernels/test_attention.py          | 2 --
 vllm/model_executor/layers/attention.py  | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py
index d8199c8e60754..452ac4c61853e 100644
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -266,7 +266,6 @@ def run_multi_query_kv_attention(
     qkv.uniform_(-1e-3, 1e-3)
     query, key, value = qkv.unbind(dim=1)
 
-    attn_op = xops.fmha.cutlass.FwOp()
     attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
     output = xops.memory_efficient_attention_forward(
         query.unsqueeze(0),
@@ -275,7 +274,6 @@ def run_multi_query_kv_attention(
         attn_bias=attn_bias,
         p=0.0,
         scale=scale,
-        op=attn_op,
     )
     output = output.squeeze(0)
 
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index a9bbb64b7eb54..c59208e293c36 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -61,7 +61,6 @@ def __init__(self,
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
-        self.attn_op = xops.fmha.cutlass.FwOp()
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
 
         assert self.num_heads % self.num_kv_heads == 0
@@ -115,7 +114,6 @@ def multi_query_kv_attention(
             attn_bias=input_metadata.attn_bias[0],
             p=0.0,
             scale=self.scale,
-            op=self.attn_op,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.squeeze(0))
@@ -404,7 +402,6 @@ def multi_query_kv_attention(
            attn_bias=input_metadata.attn_bias[i],
             p=0.0,
             scale=self.scale,
-            op=self.attn_op,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output[start:end].copy_(out.squeeze(0))
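
Note (not part of the patch): dropping the explicit `op=xops.fmha.cutlass.FwOp()` lets xformers' op dispatcher pick the fastest supported backend for the given dtype and head size, which includes the flash-attn kernels when they are available. Below is a minimal, self-contained sketch of the resulting call shape; the sequence lengths, head counts, and dtype are illustrative assumptions, not values from the patch.

    # Sketch only: same call pattern as the test above, with no explicit `op=`,
    # so xformers is free to dispatch to flash-attn when supported.
    import torch
    import xformers.ops as xops
    from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

    seq_lens = [4, 3]                 # two variable-length sequences packed together (assumed)
    num_heads, head_size = 8, 64      # assumed sizes
    total_tokens = sum(seq_lens)

    qkv = torch.empty(total_tokens, 3, num_heads, head_size,
                      dtype=torch.float16, device="cuda")
    qkv.uniform_(-1e-3, 1e-3)
    query, key, value = qkv.unbind(dim=1)

    attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
    out = xops.memory_efficient_attention_forward(
        query.unsqueeze(0),           # [1, total_tokens, num_heads, head_size]
        key.unsqueeze(0),
        value.unsqueeze(0),
        attn_bias=attn_bias,
        p=0.0,
        scale=head_size ** -0.5,
        # no `op=` argument: xformers chooses the backend automatically
    )
    out = out.squeeze(0)

With the explicit cutlass forward op pinned (as before this patch), every call was forced onto the cutlass kernel even when a faster backend applied; removing it is what makes the flash-attn path reachable without any other code change.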