fix a bug in fused rope (NVIDIA#1750)
Signed-off-by: Xin Yao <[email protected]>
yaox12 authored and caaatch22 committed Dec 23, 2024
1 parent 78344f2 commit 5b7e0d5
Showing 2 changed files with 6 additions and 5 deletions.
5 changes: 2 additions & 3 deletions csrc/megatron/fused_rotary_positional_embedding.h
@@ -52,8 +52,7 @@ __global__ void fused_rope_forward(int sq, int b, int np, int hn, int hn2,
     int offset_head = offset_block + head_id * hn;
 #pragma unroll
     for (int hn_id = hn2 + threadIdx.x; hn_id < hn; hn_id += blockDim.x) {
-      int offset_src_dst = offset_head + hn_id;
-      dst[offset_src_dst] = src[offset_src_dst];
+      dst[offset_head + hn_id] = src[offset_head + hn_id];
     }
   }
 }
@@ -89,7 +88,7 @@ __global__ void fused_rope_backward(int sq, int b, int np, int hn, int hn2,
     int offset_head = offset_block + head_id * hn;
 #pragma unroll
     for (int hn_id = hn2 + threadIdx.x; hn_id < hn; hn_id += blockDim.x) {
-      dst[offset_head + hn_id] = 1.0;
+      dst[offset_head + hn_id] = src[offset_head + hn_id];
     }
   }
 }
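The substance of the fix is in `fused_rope_backward`: with partial rotary embedding (`hn2 < hn`), channels `hn2` through `hn - 1` of each head are not rotated, so in the backward pass their gradient is the upstream gradient passed through verbatim. The old code overwrote that region with the constant `1.0` instead of copying `src`. A minimal PyTorch sketch of the pass-through property (illustrative only, not apex's kernel; the sizes and the `rotate_half` helper are hypothetical stand-ins):

    import torch

    sq, b, np_, hn, hn2 = 4, 2, 3, 8, 6   # hypothetical sizes; only hn2 < hn matters
    t = torch.randn(sq, b, np_, hn, requires_grad=True)
    theta = torch.rand(sq, 1, 1, hn2)     # stand-in rotation angles
    cos, sin = theta.cos(), theta.sin()

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    # RoPE rotates the first hn2 channels; the tail is concatenated untouched.
    rot, tail = t[..., :hn2], t[..., hn2:]
    out = torch.cat((rot * cos + rotate_half(rot) * sin, tail), dim=-1)

    upstream = 2.0 * torch.ones_like(out)  # any non-trivial incoming gradient
    out.backward(upstream)

    # The tail's gradient is the upstream gradient copied through, which is
    # exactly what the fixed kernel's `dst[...] = src[...]` computes.
    assert torch.equal(t.grad[..., hn2:], upstream[..., hn2:])

With the buggy kernel, the tail of the fused gradient would have been 1.0 everywhere, regardless of the upstream gradient.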
6 changes: 4 additions & 2 deletions tests/L0/run_transformer/test_fused_rope.py
@@ -84,13 +84,15 @@ def test_forward_backward(self):
 
         # unfused
         output_unfused = apply_rotary_pos_emb(t, emb)
-        output_unfused.sum().backward()
+        loss_unfused = output_unfused.sum() * 2
+        loss_unfused.backward()
         grad_unfused = t.grad.detach().clone()
         t.grad = None
 
         # fused
         output_fused = fused_apply_rotary_pos_emb(t, emb)
-        output_fused.sum().backward()
+        loss_fused = output_fused.sum() * 2
+        loss_fused.backward()
         grad_fused = t.grad.detach().clone()
 
         self.assertEqual(
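The test change explains why the kernel bug went unnoticed: `output.sum().backward()` sends an all-ones upstream gradient into the backward kernel, which happens to coincide with the buggy `1.0` fill, so the fused and unfused gradients matched anyway. Scaling the loss by 2 makes the upstream gradient 2.0 everywhere, so a hard-coded constant can no longer agree with it. A small standalone illustration of that reasoning:

    import torch

    x = torch.randn(4, 2, 3, 8, requires_grad=True)

    x.sum().backward()
    # All-ones gradient: indistinguishable from a buggy 1.0 fill.
    print(torch.equal(x.grad, torch.ones_like(x)))      # True

    x.grad = None
    (x.sum() * 2).backward()
    # Now the true gradient is 2.0 everywhere; a stale 1.0 would be caught.
    print(torch.equal(x.grad, 2 * torch.ones_like(x)))  # True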
