From 3fb56475ea98e9684c5444e1169f39c896f200ce Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Mon, 30 Sep 2024 13:55:39 -0500 Subject: [PATCH] apps/nccl: add unroll in allred8 --- apps/nccl/src/allreduce.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 14d0dfba..0637c5ad 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1) __syncthreads(); // Starts allgather for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { - for (int i = 0; i < nPeer; i++) { + for (int i = 0; i < NPEER; i++) { const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; @@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1) for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; - for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + for (int peerIdx = 0; peerIdx < NPEER; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; data = add_vectors(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; - for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + for (int peerIdx = 0; peerIdx < NPEER; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), data); } @@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1) } __syncthreads(); for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { - for (int i = 0; i < nPeer; i++) { + for (int i = 0; i < NPEER; i++) { const int peerIdx = (i + blockIdx.x) % nPeer; const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; @@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1) for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; - for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + for (int peerIdx = 0; peerIdx < NPEER; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; data = add_vectors(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; - for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + for (int peerIdx = 0; peerIdx < NPEER; peerIdx++) { outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4), data); }