Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bisection test #203

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions doc/PERFORMANCE.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ Similarly to broadcast, all data need to be sent to the root, hence :
And :

`B = S/t`

### Bisection

In the bisection operation, each rank is paired with a single other rank and sends it one message of size S, hence:

t = S/B

And :

B = S/t

### Summary

Expand All @@ -140,5 +150,6 @@ To obtain a bus bandwidth which should be independent of the number of ranks _n_
* AllGather : (_n_-1)/_n_
* Broadcast : 1
* Reduce : 1
* Bisection : 1

The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube bisection
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)

build: ${BIN_FILES}
Expand Down
121 changes: 121 additions & 0 deletions src/bisection.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#include "cuda_runtime.h"
#include "common.h"

// For bisection, every rank both sends and receives exactly `count` elements;
// there is no in-place offset and the kernel parameter count equals `count`.
void BisectionGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *paramcount = count;
  *sendcount = *recvcount = count;
  *sendInplaceOffset = *recvInplaceOffset = 0;
}

// Return the rank that `rank` exchanges data with in the bisection pattern.
// The pairing is an involution: getPeer(getPeer(r, n), n) == r for all r.
int getPeer(int rank, int n_ranks){
  // Multiple-of-4 communicator: shift by half the communicator, then swap
  // with the adjacent even/odd rank. Per the PR discussion, this enables
  // cross-rail bisection on rail-optimized topologies.
  if (n_ranks % 4 == 0)
    return ((n_ranks / 2 + rank) % n_ranks) + (rank % 2 ? -1 : 1);
  // Odd number of ranks: the last rank has no partner and pairs with itself.
  if (n_ranks % 2 == 1 && rank == n_ranks - 1)
    return rank;
  // Remaining cases: plain half-shift among the largest even prefix of ranks
  // (the odd trailing rank, if any, was handled above).
  return (rank + n_ranks / 2) % (n_ranks - n_ranks % 2);
}

// Initialize send data with this rank's unique pattern and the expected
// buffer with the bisection peer's pattern, so verification confirms each
// rank received exactly what its partner sent.
testResult_t BisectionInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  // Global rank count across all processes, threads, and GPUs.
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
    CUDACHECK(cudaSetDevice(args->gpus[i]));
    // Global rank of the i-th GPU handled by this proc/thread.
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    // Seed send data with a rank-dependent offset so every rank's payload is distinct.
    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
    int peer = getPeer(rank, nranks);
    // Expected receive data is the peer's payload (seeded with the peer's offset).
    TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
    CUDACHECK(cudaDeviceSynchronize());
  }
  // We don't support in-place sendrecv, so skip verification in that mode.
  args->reportErrors = in_place ? 0 : 1;
  return testSuccess;
}

// Algorithm bandwidth is bytes moved per rank divided by time (GB/s).
// Per doc/PERFORMANCE.md the bisection bus-bandwidth factor is 1, so
// busBw equals algBw.
void BisectionGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double gigabytes = (double)(count * typesize) / 1.0E9;
  *algBw = gigabytes / sec;
  *busBw = *algBw;
}

// Run one bisection iteration: every rank exchanges a single message with
// its getPeer() partner via a grouped send/recv pair (a self-paired odd
// last rank simply sends to and receives from itself).
testResult_t BisectionRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int comm_rank, n_ranks;
  NCCLCHECK(ncclCommUserRank(comm, &comm_rank));
  NCCLCHECK(ncclCommCount(comm, &n_ranks));

  int peer = getPeer(comm_rank, n_ranks);

  // Group the send and recv so they are issued as one fused operation and
  // cannot deadlock on each other.
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream));
  NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream));
  NCCLCHECK(ncclGroupEnd());

  return testSuccess;
}

// Collective-test descriptor wiring the bisection callbacks into the
// common test harness.
struct testColl bisectionTest = {
  "Bisection",
  BisectionGetCollByteCount,
  BisectionInitData,
  BisectionGetBw,
  BisectionRunColl
};

// Buffer sizes are exactly the collective byte counts; the remaining
// outputs of BisectionGetCollByteCount are not needed here and discarded.
void BisectionGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratch[3];
  BisectionGetCollByteCount(sendcount, recvcount, &scratch[0], &scratch[1], &scratch[2], count, nranks);
}

// Harness entry point: time the bisection pattern over the requested
// datatype/op combinations (or the full supported set when unspecified).
testResult_t BisectionRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &bisectionTest;

  // Default to sweeping everything the harness knows about; narrow to the
  // single requested type/op when one was given on the command line
  // (sentinel -1 means "not specified").
  ncclDataType_t *types_to_run = test_types;
  const char **typenames_to_run = test_typenames;
  int n_types = test_typenum;
  if ((int)type != -1) {
    types_to_run = &type;
    typenames_to_run = &typeName;
    n_types = 1;
  }

  ncclRedOp_t *ops_to_run = test_ops;
  const char **opnames_to_run = test_opnames;
  int n_ops = test_opnum;
  if ((int)op != -1) {
    ops_to_run = &op;
    opnames_to_run = &opName;
    n_ops = 1;
  }

  // Root is irrelevant for bisection, hence the fixed -1 passed to TimeTest.
  for (int t = 0; t < n_types; t++) {
    for (int o = 0; o < n_ops; o++) {
      TESTCHECK(TimeTest(args, types_to_run[t], typenames_to_run[t], ops_to_run[o], opnames_to_run[o], -1));
    }
  }
  return testSuccess;
}

// Engine descriptor for the bisection_perf binary.
struct testEngine bisectionEngine = {
  BisectionGetBuffSize,
  BisectionRunTest
};

// Override the harness's weak ncclTestEngine symbol with this engine.
#pragma weak ncclTestEngine=bisectionEngine