Commit

bugfix with beta value in attention computation
Ofir Press committed Sep 16, 2021
1 parent 06dc2d7 commit 5b5afb2
Showing 1 changed file with 1 addition and 1 deletion.
megatron/model/transformer.py  (1 addition, 1 deletion)

@@ -304,7 +304,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None,
            matmul_result,
            query_layer.transpose(0, 1),  # [b * np, sq, hn]
            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-           beta=0.0, alpha=(1.0/self.norm_factor))
+           beta=0.0 if alibi is None else 1.0, alpha=(1.0/self.norm_factor))

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)
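
For context on why the beta value matters: torch.baddbmm(input, batch1, batch2, beta=..., alpha=...) returns beta * input + alpha * (batch1 @ batch2). When ALiBi is enabled, the position biases are preloaded into matmul_result, so beta must be 1.0 to keep them; with beta=0.0 the biases are multiplied away and the scores reduce to plain scaled dot-product attention, which is the bug this commit fixes. Below is a minimal sketch of the intended semantics; the tensor shapes and the standalone alibi and norm_factor values are illustrative stand-ins, not the actual megatron variables.

import torch

# Illustrative sizes only: b*np = 4 batched heads, sq = sk = 8 tokens, hn = 16 head dim.
bnp, sq, sk, hn = 4, 8, 8, 16
norm_factor = hn ** 0.5

query_layer = torch.randn(bnp, sq, hn)   # [b * np, sq, hn]
key_layer_t = torch.randn(bnp, hn, sk)   # [b * np, hn, sk]

# Stand-in for the ALiBi bias that the real code preloads into matmul_result;
# set alibi = None to model the non-ALiBi path.
alibi = torch.randn(bnp, sq, sk)
matmul_result = alibi.clone() if alibi is not None else torch.empty(bnp, sq, sk)

# baddbmm computes beta * matmul_result + alpha * (query_layer @ key_layer_t).
# beta=1.0 keeps the preloaded ALiBi biases; beta=0.0 would discard them.
attention_scores = torch.baddbmm(
    matmul_result,
    query_layer,
    key_layer_t,
    beta=0.0 if alibi is None else 1.0,
    alpha=(1.0 / norm_factor))

# Reference computation for the ALiBi case.
expected = alibi + (query_layer @ key_layer_t) / norm_factor
assert torch.allclose(attention_scores, expected, atol=1e-5)

Before this commit, beta was hard-coded to 0.0, so the ALiBi biases written into matmul_result never reached the attention scores.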
