Add correct memory-allocation at DeepSpeed-Attention (#2474)
Co-authored-by: Jeff Rasley <[email protected]>
Co-authored-by: Connor Holmes <[email protected]>
3 people authored Nov 8, 2022
1 parent a47c3e0 commit 9cfcf74
Showing 3 changed files with 16 additions and 14 deletions.
14 changes: 0 additions & 14 deletions csrc/transformer/inference/csrc/pt_binding.cpp
@@ -849,20 +849,6 @@ at::Tensor ds_linear_layer(at::Tensor& input,
     int head_size = input_cont.size(2) / num_heads;
     int bsz = input.size(0) * input.size(1);
     T* workspace = (T*)Context::Instance().GetWorkSpace();
-    // Reallocate memory if we received a new prompt
-    if (!workspace) {
-        cublasSetStream(Context::Instance().GetCublasHandle(),
-                        Context::Instance().GetCurrentStream());
-        allocate_workspace<T>(input.size(2),
-                              input.size(0),
-                              input.size(1),
-                              num_layers,
-                              num_heads,
-                              1,
-                              external_cache,
-                              0);
-        workspace = (T*)Context::Instance().GetWorkSpace();
-    }
     auto output = at::from_blob(workspace, {input.size(0), input.size(1), weight.size(1)}, options);
 
     float alpha = (T)1.0;
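Worth noting about the deleted block: despite the comment about reallocating for a new prompt, the guard `if (!workspace)` only fires when no workspace exists at all, so the buffer is sized from whichever input happens to reach ds_linear_layer first and is never grown afterwards. The rest of the commit moves allocation out of this op and into an explicit, config-driven call in attention.py. A minimal Python sketch of the pitfall, using hypothetical names rather than DeepSpeed's actual code:

```python
import torch

# Minimal sketch (hypothetical names, not DeepSpeed's code) of why a null-check
# guard is not enough: the buffer is sized by the first input and never grows.
_workspace = None

def lazy_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    global _workspace
    if _workspace is None:  # fires only once, for the very first prompt
        _workspace = torch.empty(x.size(0), x.size(1), weight.size(1))
    assert _workspace.size(1) >= x.size(1), "workspace was sized for the first prompt only"
    out = _workspace[:x.size(0), :x.size(1)]
    torch.matmul(x, weight, out=out)
    return out

# lazy_linear(torch.randn(1, 8, 16), torch.randn(16, 4))   # allocates for seq_len=8
# lazy_linear(torch.randn(1, 32, 16), torch.randn(16, 4))  # longer prompt: assertion fails
```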
1 change: 1 addition & 0 deletions deepspeed/module_inject/replace_module.py
@@ -210,6 +210,7 @@ def replace_attn(child, policy, layer_id):
         heads=heads,
         fp16=fp16,
         triangular_masking=False,
+        max_out_tokens=4096,
     )
     attn_module = transformer_inference.DeepSpeedAttention(config)
 
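The new `max_out_tokens=4096` entry gives the attention config an explicit upper bound on tokens per sequence; the allocation added in attention.py below reads it as `self.config.max_out_tokens`, so the workspace can be sized once for the longest sequence that will be served rather than for the first prompt seen. A rough sketch of that idea, with hypothetical names and an illustrative sizing formula (not DeepSpeed's actual one):

```python
from dataclasses import dataclass

# Hypothetical stand-in for the inference config: carrying an explicit token
# budget lets the first forward pass pre-size a shared buffer instead of
# guessing from whatever prompt happens to arrive first.
@dataclass
class AttnConfig:
    hidden_size: int
    heads: int
    fp16: bool = True
    triangular_masking: bool = False
    max_out_tokens: int = 4096

def workspace_elements(cfg: AttnConfig, batch_size: int, num_layers: int) -> int:
    # Illustrative sizing only (not DeepSpeed's formula): room for keys and
    # values of every layer, for up to max_out_tokens tokens per sequence.
    return 2 * num_layers * batch_size * cfg.max_out_tokens * cfg.hidden_size

# workspace_elements(AttnConfig(hidden_size=1024, heads=16), batch_size=1, num_layers=24)
# -> 201,326,592 elements (~0.4 GB in fp16)
```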
15 changes: 15 additions & 0 deletions deepspeed/ops/transformer/inference/attention.py
@@ -233,6 +233,9 @@ def __init__(
                                inference_cuda_module.linear_layer_fp32
         self.cuda_graph_created = False
         self.enable_cuda_graph = False
+        self.allocate_workspace = inference_cuda_module.allocate_workspace_fp32 if (not config.fp16) else \
+            inference_cuda_module.allocate_workspace_fp16
+        self.iter = 0
 
     def _graph_replay(self, *inputs, **kwargs):
         for i in range(len(inputs)):
@@ -275,6 +278,18 @@ def forward(self, *inputs, **kwargs):
         return outputs
 
     def _forward(self, input, context=None, input_mask=None):
+        # Allocate memory only on first layer forward
+        if self.config.layer_id == 0 and self.iter == 0:
+            self.iter += 1
+            self.allocate_workspace(self.config.hidden_size,
+                                    input.size()[0],
+                                    input.size()[1],
+                                    DeepSpeedAttention.layer_id,
+                                    self.config.heads,
+                                    self.config.mp_size,
+                                    self.config.bigscience_bloom,
+                                    0,
+                                    self.config.max_out_tokens)
         output = DeepSpeedAttentionFunction.apply(input,
                                                   context,
                                                   input_mask,
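Putting the pieces together: the guard added to `_forward` runs the allocation exactly once, on the first call of the layer whose `config.layer_id` is 0, and it passes the class attribute `DeepSpeedAttention.layer_id`, which at that point should equal the number of attention layers constructed, so the single workspace covers every layer. A self-contained sketch of the same allocate-once-on-first-forward pattern, with hypothetical names:

```python
import torch

# Minimal sketch (hypothetical names) of the allocate-once-on-first-forward
# pattern used above: the first layer's first call sizes one shared buffer
# large enough for all layers and for up to max_out_tokens tokens.
class TinyAttention(torch.nn.Module):
    layer_id = 0          # class-level counter: total layers constructed so far
    workspace = None      # shared buffer, allocated once

    def __init__(self, hidden_size: int, max_out_tokens: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.max_out_tokens = max_out_tokens
        self.my_layer_id = TinyAttention.layer_id   # 0 for the first layer built
        TinyAttention.layer_id += 1
        self.iter = 0

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.my_layer_id == 0 and self.iter == 0:
            self.iter += 1
            num_layers = TinyAttention.layer_id      # every layer exists by now
            batch = hidden_states.size(0)
            TinyAttention.workspace = torch.empty(
                num_layers, 2, batch, self.max_out_tokens, self.hidden_size,
                dtype=hidden_states.dtype, device=hidden_states.device)
        # ... attention itself would read/write its slice of the shared workspace ...
        return hidden_states

# layers = [TinyAttention(hidden_size=64, max_out_tokens=128) for _ in range(4)]
# x = torch.randn(2, 16, 64)
# for layer in layers:
#     x = layer(x)   # the first layer's first call allocates one buffer for all four
```

Because the buffer is sized from the configured token budget rather than from the current prompt, it never has to be reallocated as generation proceeds.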
