Skip to content

Commit

Permalink
Resync with NCCL 2.13
Browse files Browse the repository at this point in the history
* Added "verifiable", a suite of kernels for generating and verifying reduction
  input and output arrays in a bit-precise way.
* Data corruption errors now reported in number of wrong elements instead of max
  deviation.
* Use ncclGetLastError.
* Don't run hypercube on non-powers of 2 ranks.
* Fix to hypercube data verification.
* Use "thread local" as the defaut CUDA capture mode.
* Replaced pthread_yield -> sched_yield()
* Bugfix to the cpu-side barrier/allreduce implementations.
  • Loading branch information
jbachan committed Aug 23, 2022
1 parent 8274cb4 commit 51af557
Show file tree
Hide file tree
Showing 18 changed files with 1,705 additions and 514 deletions.
8 changes: 6 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,16 @@ build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}

${DST_DIR}/%.o: %.cu common.h
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk

${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(NVCC) -o $@ $(NVCUFLAGS) -c $<

${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS}
Expand Down
16 changes: 2 additions & 14 deletions src/all_gather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,6 @@
#include "cuda_runtime.h"
#include "common.h"

void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}

void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
Expand All @@ -38,9 +26,9 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
CUDACHECK(cudaDeviceSynchronize());
}
Expand Down
14 changes: 1 addition & 13 deletions src/all_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,6 @@
#include "cuda_runtime.h"
#include "common.h"

void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}

void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
Expand All @@ -38,7 +26,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
CUDACHECK(cudaDeviceSynchronize());
}
Expand Down
17 changes: 3 additions & 14 deletions src/alltoall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,6 @@
#include "cuda_runtime.h"
#include "common.h"

void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}

void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
Expand All @@ -39,9 +27,10 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
size_t partcount = sendcount/nranks;
TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
CUDACHECK(cudaDeviceSynchronize());
}
Expand Down
16 changes: 2 additions & 14 deletions src/broadcast.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,6 @@
#include "cuda_runtime.h"
#include "common.h"

void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}

void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}

void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
Expand All @@ -37,8 +25,8 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root));
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
Expand Down
Loading

0 comments on commit 51af557

Please sign in to comment.