From 071d8ac9288298a862d54192f217fe939cfa27e0 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Fri, 2 Jun 2023 23:09:01 +0000 Subject: [PATCH 01/19] Initialized alltoallv test --- src/alltoallv.cu | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/alltoallv.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu new file mode 100644 index 0000000..33a0b28 --- /dev/null +++ b/src/alltoallv.cu @@ -0,0 +1,111 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + + //Can maybe introduce heterogeneity in data size by conditioning sendcnt and recvcont on rank here? + + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoallv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + + for (int r=0; rcollTest = &AlltoAllvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Sat, 3 Jun 2023 00:11:31 +0000 Subject: [PATCH 02/19] Initial draft of alltoallv test. 
Changes: - Added variable count of elements to send/recv based on sending/recieving peers - Added new file to make file Notes: - Current method of uniquely identifying the peers that are sending (thread_local of thread number) may not work correctly. Not sure if that is the appropriate way to determine rank. --- src/Makefile | 2 +- src/alltoallv.cu | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 393de8e..3ef4ebd 100644 --- a/src/Makefile +++ b/src/Makefile @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall alltoallv scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 33a0b28..26bc67e 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -7,6 +7,8 @@ #include "cuda_runtime.h" #include "common.h" +thread_local int threadNum = -1; + void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; @@ -17,8 +19,6 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - //Can maybe introduce heterogeneity in data size by conditioning sendcnt and recvcont on rank here? - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; @@ -59,10 +59,15 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl #else NCCLCHECK(ncclGroupStart()); + for (int r=0; rcollTest = &AlltoAllvTest; + threadNum = args->thread; ncclDataType_t *run_types; const char **run_typenames; int type_count; - if ((int)type != -1) { type_count = 1; run_types = &type; From 086e318b9eda31926983a9bf3a18ce9646cec0e5 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Sat, 3 Jun 2023 00:39:11 +0000 Subject: [PATCH 03/19] changed to use nccl rank finding function. 
This should now use a valid rank --- src/alltoallv.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 26bc67e..fef27ab 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -7,7 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -thread_local int threadNum = -1; void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; @@ -49,8 +48,9 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - int nRanks; + int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); + NCCLCHECK(ncclCommUserRank(comm, &myRank)); size_t rankOffset = count * wordSize(type); #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 @@ -61,8 +61,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl for (int r=0; rcollTest = &AlltoAllvTest; - threadNum = args->thread; ncclDataType_t *run_types; const char **run_typenames; int type_count; From 2f0cb03ca6d98429a56d1ad7d7c48cb5b6a172fd Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 20:39:05 +0000 Subject: [PATCH 04/19] created evaluation code for a2av static imbalancing --- src/alltoallv.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index fef27ab..a4ba781 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -9,11 +9,11 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; - *recvcount = (count/nranks)*nranks; + *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. + *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; + *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -24,13 +24,16 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + //j == peer rank + size_t partcount = sendcount/nranks; //create chunk definition to use in offsetting the data initialization + size_t partcount_mod = (partcount - j - rank - 1) % partcount; //imbalance the count of data to initialize same way we do in the test + TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } From e8e2bf57fac38a74d9fd8d5832a8e112a21bc4d9 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 20:51:38 +0000 Subject: [PATCH 05/19] convenience script --- src/runtest.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 src/runtest.sh diff --git a/src/runtest.sh b/src/runtest.sh new file mode 100755 index 0000000..d73f21d --- /dev/null +++ b/src/runtest.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Convenience script to run all tests in the build output + +for script in ../build/*_perf; do + echo $script; + ./$script $@; +done From 5c0ea182acc172c84395fbbd9e847a1dfc88f42b Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 21:10:14 +0000 Subject: [PATCH 06/19] added a default load case --- src/runtest.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/runtest.sh b/src/runtest.sh index d73f21d..b07068a 100755 --- a/src/runtest.sh +++ b/src/runtest.sh @@ -3,5 +3,9 @@ for script in ../build/*_perf; do echo $script; - ./$script $@; + if [ -z "$@"]; then + ./$script -b 8 -e 128M -f 2 -g 2 #convenient default, tests a variety of loads + else + ./$script $@; + fi done From 53bc49625807a5de1f8b111fd9ab48787a8e9312 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Wed, 7 Jun 2023 17:33:07 +0000 Subject: [PATCH 07/19] created second atav testfile for granular test development --- src/alltoallv.cu | 6 --- src/alltoallv2.cu | 111 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 src/alltoallv2.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu index a4ba781..dccbb71 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -1,9 +1,3 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - #include "cuda_runtime.h" #include "common.h" diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu new file mode 100644 index 0000000..dccbb71 --- /dev/null +++ b/src/alltoallv2.cu @@ -0,0 +1,111 @@ +#include "cuda_runtime.h" +#include "common.h" + + +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. + *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. 
+ *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoallv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks, myRank; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + NCCLCHECK(ncclCommUserRank(comm, &myRank)); + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + + + for (int r=0; rcollTest = &AlltoAllvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Thu, 8 Jun 2023 16:35:22 +0000 Subject: [PATCH 08/19] First draft of more granular alltoallv - alltoallv2.cu testfile: Parameterizes with alltoallv_param.csv - run_a2av.sh script: -- Runs the built test with an arbitrarily named CSV instead of the static name -- Passes through other arguments to the testfile --- run_a2av.sh | 4 +++ src/alltoallv2.cu | 73 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 10 deletions(-) create mode 100755 run_a2av.sh diff --git a/run_a2av.sh b/run_a2av.sh new file mode 100755 index 0000000..1e34789 --- /dev/null +++ b/run_a2av.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +cp $1 ./alltoallv_param.csv +./build/alltoallv2_perf ${@:2} diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index dccbb71..cd2ccef 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -1,7 +1,47 @@ +#include +#include +#include +#include #include "cuda_runtime.h" #include "common.h" +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors){ + //Open param csv + std::vector> paramFile_data; + std::ifstream paramFile("alltoallv_param.csv"); + + if (!paramFile.is_open()) { + PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); + return testInternalError; + exit(-1); + } + + std::string row; + while(std::getline(paramFile,row)){ //iterate over every row + + std::vector values; //values from this line + std::stringstream rowstream(row); + std::string value; + while(std::getline(rowstream,value,',')){ //go over the row and get each value + double dval = std::stod(value); + if(dval<0 || dval>1) { + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE\n"); + return testInternalError; + exit(-1); + } //ensure that the value is between 0 and 1 (inclusive) + + values.push_back(dval); + } + if(values.size()!=nranks) return testInternalError; //ensure that this row has the right amount of values + paramFile_data.push_back(values); + } + + if(paramFile_data.size()!=nranks) return testInternalError; //ensure we have the right amount of rows + + imbalancingFactors = paramFile_data; //store the data in the global variable + return testSuccess; +} void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. 
@@ -11,11 +51,13 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - + //parse the param file + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors); + if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) @@ -23,10 +65,11 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); @@ -48,6 +91,9 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors); //parse the param file + if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 @@ -55,12 +101,19 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl return testNcclError; #else NCCLCHECK(ncclGroupStart()); - - for (int r=0; rimbalancingFactors.size()){ + PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); + return testInternalError; + } else if (r > imbalancingFactors[myRank].size()) { + PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); + return testInternalError; + } + unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; + unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; + NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); } From ef00fa9af63fc0f0b26adc7731ee6179e5dcf8d9 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Fri, 9 Jun 2023 23:19:45 +0000 Subject: [PATCH 09/19] This version passes row == 1 distribution guarantee. Each Rank is guaranteed to send X/nranks data in some distribution. 
--- src/alltoallv2.cu | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index cd2ccef..cda3b61 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -14,40 +14,50 @@ testResult_t parseParamFile(int nranks, std::vector> &imbala if (!paramFile.is_open()) { PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); return testInternalError; - exit(-1); } std::string row; + int rowidx = 0; while(std::getline(paramFile,row)){ //iterate over every row - std::vector values; //values from this line std::stringstream rowstream(row); std::string value; - while(std::getline(rowstream,value,',')){ //go over the row and get each value + double rowsum = 0; + while(std::getline(rowstream,value,',')){ //go over the row and get each value double dval = std::stod(value); if(dval<0 || dval>1) { - PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE\n"); + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); return testInternalError; - exit(-1); - } //ensure that the value is between 0 and 1 (inclusive) - + } //ensure that the value is between 0 and 1 (necessary for probability distribution) + rowsum += dval; values.push_back(dval); } - if(values.size()!=nranks) return testInternalError; //ensure that this row has the right amount of values + if(rowsum!=1.0){ + PRINT("\nINVALID PARAMS FILE, SUM OF ROW %i IS NOT 1\n", rowidx); + return testInternalError; + } //ensure that this row is a valid distribution + if(values.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES\n", rowidx); + return testInternalError; + }//ensure that this row has the right amount of values paramFile_data.push_back(values); + rowidx++; } - if(paramFile_data.size()!=nranks) return testInternalError; //ensure we have the right amount of rows + if(paramFile_data.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); + return testInternalError; + } //ensure we have the right amount of rows imbalancingFactors = paramFile_data; //store the data in the global variable return testSuccess; } void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. - *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. + *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks + *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. + *paramcount = (count/nranks); //each rank in a2av can send up to 1/nranks data. 
} testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -67,7 +77,6 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); From 6b7e790185997eb3b3f0dea422d6ccd9fb789062 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 02:10:04 +0000 Subject: [PATCH 10/19] added CLI arg for param file, switched to using variable in testcase --- src/alltoallv2.cu | 22 +++++++++++----------- src/common.cu | 9 ++++++++- src/common.h | 2 ++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index cda3b61..b6b9a4b 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -5,11 +5,12 @@ #include "cuda_runtime.h" #include "common.h" +int CHECK = 0; -testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors){ +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ //Open param csv std::vector> paramFile_data; - std::ifstream paramFile("alltoallv_param.csv"); + std::ifstream paramFile(filename); if (!paramFile.is_open()) { PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); @@ -22,22 +23,16 @@ testResult_t parseParamFile(int nranks, std::vector> &imbala std::vector values; //values from this line std::stringstream rowstream(row); std::string value; - double rowsum = 0; while(std::getline(rowstream,value,',')){ //go over the row and get each value double dval = std::stod(value); if(dval<0 || dval>1) { PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); return testInternalError; } //ensure that the value is between 0 and 1 (necessary for probability distribution) - rowsum += dval; values.push_back(dval); } - if(rowsum!=1.0){ - PRINT("\nINVALID PARAMS FILE, SUM OF ROW %i IS NOT 1\n", rowidx); - return testInternalError; - } //ensure that this row is a valid distribution if(values.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES\n", rowidx); + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); return testInternalError; }//ensure that this row has the right amount of values paramFile_data.push_back(values); @@ -66,7 +61,8 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; //parse the param file std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors); + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); + CHECK = 1; if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); @@ -101,10 +97,14 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors); //parse the param file + struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); 
//parse the param file if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); + // Get the base address of the previous stack frame. + // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; diff --git a/src/common.cu b/src/common.cu index 48a629c..f8f37c9 100644 --- a/src/common.cu +++ b/src/common.cu @@ -59,6 +59,7 @@ int is_main_proc = 0; thread_local int is_main_thread = 0; // Command line parameter defaults +static char param_file[64]; static int nThreads = 1; static int nGpus = 1; static size_t minBytes = 32*1024*1024; @@ -685,6 +686,7 @@ int main(int argc, char* argv[]) { double parsed; int longindex; static struct option longopts[] = { + {"param_file",optional_argument, 0, 'v'}, {"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, @@ -711,12 +713,15 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); + c = getopt_long(argc, argv, "v:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); if (c == -1) break; switch(c) { + case 'v': + strcpy(param_file,optarg); + break; case 't': nThreads = strtol(optarg, NULL, 0); break; @@ -983,6 +988,8 @@ testResult_t run() { memset(threads, 0, sizeof(struct testThread)*nThreads); for (int t=nThreads-1; t>=0; t--) { + strcpy(threads[t].args.param_file, param_file); + threads[t].args.minbytes=minBytes; threads[t].args.maxbytes=maxBytes; threads[t].args.stepbytes=stepBytes; diff --git a/src/common.h b/src/common.h index 20fa461..6d18d48 100644 --- a/src/common.h +++ b/src/common.h @@ -110,6 +110,8 @@ struct testEngine { extern struct testEngine ncclTestEngine; struct threadArgs { + char param_file[64]; + size_t nbytes; size_t minbytes; size_t maxbytes; From 025333acfdc716f7d076dc5b531cc73648773c62 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 17:25:49 +0000 Subject: [PATCH 11/19] added documentation --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 580996b..f5447af 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ All tests support the same set of arguments : * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation + * `-v,--param_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. 
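A usage sketch for the new -v/--param_file option (illustrative only: the CSV file name is made up, the sizes mirror the defaults used in runtest.sh, and the binary path assumes the alltoallv_perf target added to the Makefile earlier in this series, which picks up the param-file handling once it is folded back into alltoallv.cu in the next patch). A single-process run on 4 GPUs gives nranks = 4, so the file must hold a 4x4 matrix of values in [0,1], one row per sending rank:

  0.25,0.25,0.25,0.25
  0.50,0.50,0.50,0.50
  0.75,0.75,0.75,0.75
  1,1,1,1

saved, for example, as params_4ranks.csv and passed on the command line as:

  ./build/alltoallv_perf -v params_4ranks.csv -g 4 -b 8 -e 128M -f 2

These matrix values match the Rank4Test2.csv reference file added later in the series; each entry scales the count/nranks chunk sent from the row's rank to the column's peer.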
From f94647f20987ffe5316cfc4980d6900c9a3149f7 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 19:44:37 +0000 Subject: [PATCH 12/19] rename + docs --- src/alltoallv.cu | 96 +++++++++++++++++++++---- src/alltoallv2.cu | 173 ---------------------------------------------- 2 files changed, 83 insertions(+), 186 deletions(-) delete mode 100644 src/alltoallv2.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu index dccbb71..1ade93d 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -1,21 +1,74 @@ +#include +#include +#include +#include #include "cuda_runtime.h" #include "common.h" +int CHECK = 0; + +/** + * @brief Parses the parameter file and stores the matrix data into the imbalancingFactors reference passed in. + * @param nranks The number of ranks in the test + * @param imbalancingFactors The reference to the vector that will store the parsed data + * @param filename The name of the parameter file to parse +**/ +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ + std::vector> paramFile_data; + std::ifstream paramFile(filename); + + if (!paramFile.is_open()) { + PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); + return testInternalError; + } + + std::string row; + int rowidx = 0; + while(std::getline(paramFile,row)){ //iterate over every row + std::vector values; //values from this line + std::stringstream rowstream(row); + std::string value; + while(std::getline(rowstream,value,',')){ //go over the row and get each value + double dval = std::stod(value); + if(dval<0 || dval>1) { + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); + return testInternalError; + } //ensure that the value is between 0 and 1 (necessary for probability distribution) + values.push_back(dval); + } + if(values.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); + return testInternalError; + }//ensure that this row has the right amount of values + paramFile_data.push_back(values); + rowidx++; + } + if(paramFile_data.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); + return testInternalError; + } //ensure we have the right amount of rows + + imbalancingFactors = paramFile_data; //store the data in the return variable + return testSuccess; +} void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. - *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. + *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks + *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. 
+ *paramcount = (count/nranks); } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - + //parse the param file + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); + CHECK = 1; + if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) @@ -23,10 +76,10 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); @@ -48,19 +101,36 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); + std::vector> imbalancingFactors; + + // Since this function is only ever called from the startColl function, this builtin call will return the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct, which contains the param filename. + struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file + if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); + // Get the base address of the previous stack frame. + // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; #else NCCLCHECK(ncclGroupStart()); - - for (int r=0; rimbalancingFactors.size()){ + PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); + return testInternalError; + } else if (r > imbalancingFactors[myRank].size()) { + PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); + return testInternalError; + } + unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; + unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; + NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); } diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu deleted file mode 100644 index b6b9a4b..0000000 --- a/src/alltoallv2.cu +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include -#include -#include "cuda_runtime.h" -#include "common.h" - -int CHECK = 0; - -testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ - //Open param csv - std::vector> paramFile_data; - std::ifstream paramFile(filename); - - if (!paramFile.is_open()) { - PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); - return testInternalError; - } - - std::string row; - int rowidx = 0; - while(std::getline(paramFile,row)){ //iterate over every row - std::vector values; //values from this line - std::stringstream rowstream(row); - std::string value; - while(std::getline(rowstream,value,',')){ //go over the row and get each value - double dval = std::stod(value); - if(dval<0 || dval>1) { - PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); - return testInternalError; - } //ensure that the value is between 0 and 1 (necessary for probability distribution) - values.push_back(dval); - } - if(values.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); - return testInternalError; - }//ensure that this row has the right amount of values - paramFile_data.push_back(values); - rowidx++; - } - - if(paramFile_data.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); - return testInternalError; - } //ensure we have the right amount of rows - - imbalancingFactors = paramFile_data; //store the data in the global variable - return testSuccess; -} -void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks - *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = (count/nranks); //each rank in a2av can send up to 1/nranks data. 
-} - -testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; - //parse the param file - std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); - CHECK = 1; - if(parseSuccess != testSuccess) return parseSuccess; - for (int i=0; inGpus; i++) { - CUDACHECK(cudaSetDevice(args->gpus[i])); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) - CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); - } - CUDACHECK(cudaDeviceSynchronize()); - } - // We don't support in-place alltoallv - args->reportErrors = in_place ? 0 : 1; - return testSuccess; -} - -void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; - - *algBw = baseBw; - double factor = ((double)(nranks-1))/((double)(nranks)); - *busBw = baseBw * factor; -} - -testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - int nRanks, myRank; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - NCCLCHECK(ncclCommUserRank(comm, &myRank)); - std::vector> imbalancingFactors; - struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); - testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file - if(parseSuccess != testSuccess) return parseSuccess; - size_t rankOffset = count * wordSize(type); - - // Get the base address of the previous stack frame. - // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct. -#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 - printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); - return testNcclError; -#else - NCCLCHECK(ncclGroupStart()); - for (int r=0; rimbalancingFactors.size()){ - PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); - return testInternalError; - } else if (r > imbalancingFactors[myRank].size()) { - PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); - return testInternalError; - } - unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; - unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; - NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); - NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); - } - - - NCCLCHECK(ncclGroupEnd()); - return testSuccess; -#endif -} - -struct testColl AlltoAllvTest = { - "AlltoAllV", - AlltoAllvGetCollByteCount, - AlltoAllvInitData, - AlltoAllvGetBw, - AlltoAllvRunColl -}; - -void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { - size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AlltoAllvGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); -} - -testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { - args->collTest = &AlltoAllvTest; - ncclDataType_t *run_types; - const char **run_typenames; - int type_count; - if ((int)type != -1) { - type_count = 1; - run_types = &type; - run_typenames = &typeName; - } else { - type_count = test_typenum; - run_types = test_types; - run_typenames = test_typenames; - } - - for (int i=0; i Date: Tue, 27 Jun 2023 20:08:41 +0000 Subject: [PATCH 13/19] removed unnecessary script --- run_a2av.sh | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 run_a2av.sh diff --git a/run_a2av.sh b/run_a2av.sh deleted file mode 100755 index 1e34789..0000000 --- a/run_a2av.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -cp $1 ./alltoallv_param.csv -./build/alltoallv2_perf ${@:2} From 8ad54bd017fc4a560ac292520a11a923624870fc Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 20:14:54 +0000 Subject: [PATCH 14/19] cleanup --- src/runtest.sh | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100755 src/runtest.sh diff --git a/src/runtest.sh b/src/runtest.sh deleted file mode 100755 index b07068a..0000000 --- a/src/runtest.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Convenience script to run all tests in the build output - -for script in ../build/*_perf; do - echo $script; - if [ -z "$@"]; then - ./$script -b 8 -e 128M -f 2 -g 2 #convenient default, tests a variety of loads - else - ./$script $@; - fi -done From 8d6c8d84fe532ba6fe74da4c96e56d93ef5a22ef Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Mon, 10 Jul 2023 16:46:11 +0000 Subject: [PATCH 15/19] Removed builtin usage, replaced with function argument --- src/all_gather.cu | 2 +- src/all_reduce.cu | 2 +- src/alltoall.cu | 2 +- src/alltoallv.cu | 22 ++++++---------------- src/broadcast.cu | 2 +- src/common.cu | 3 +-- src/common.h | 2 +- src/gather.cu | 2 +- src/hypercube.cu | 2 +- src/reduce.cu | 2 +- src/reduce_scatter.cu | 2 +- src/scatter.cu | 2 +- src/sendrecv.cu | 2 +- 13 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index 0831207..9031ed1 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -45,7 +45,7 @@ void AllGatherGetBw(size_t count, 
int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index a38eabe..3bdfb3b 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -40,7 +40,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 41c7c4a..b737dbb 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -45,7 +45,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); size_t rankOffset = count * wordSize(type); diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 1ade93d..d9bf108 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -57,12 +57,11 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = (count/nranks); + *paramcount = (count/nranks); //Each rank can send a maximum of count/nranks data to each other rank } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); + size_t maxchunk = args->nbytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; //parse the param file std::vector> imbalancingFactors; @@ -75,12 +74,10 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank + TESTCHECK(InitData(data, maxchunk*nranks, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank. 
Should be chunk size * nranks for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + size_t partcount_mod = maxchunk * imbalancingFactors[j][rank]; //imbalance the count of data to initialize same way we do in the test + TESTCHECK(InitData((char*)args->expected[i] + j*maxchunk*wordSize(type), partcount_mod, rank*maxchunk, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } @@ -97,22 +94,15 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); std::vector> imbalancingFactors; - - // Since this function is only ever called from the startColl function, this builtin call will return the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct, which contains the param filename. - struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); - // Get the base address of the previous stack frame. - // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; diff --git a/src/broadcast.cu b/src/broadcast.cu index 903066a..9914a57 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -39,7 +39,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index f8f37c9..b7bc46f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -374,11 +374,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); } #endif - TESTCHECK(args->collTest->runColl( (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), (void*)(in_place ? 
recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); + count, type, op, root, args->comms[i], args->streams[i], args)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) if(opIndex >= ncclNumOps) { diff --git a/src/common.h b/src/common.h index 6d18d48..a559c0a 100644 --- a/src/common.h +++ b/src/common.h @@ -92,7 +92,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src/gather.cu b/src/gather.cu index 03ef4d9..9f3c5ad 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -43,7 +43,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/hypercube.cu b/src/hypercube.cu index 5c1456f..8cb73ed 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -45,7 +45,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; diff --git a/src/reduce.cu b/src/reduce.cu index f2fa80d..6fed902 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -39,7 +39,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index ed372e3..f23cbc2 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -44,7 +44,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduceScatter(sendbuff, 
recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/scatter.cu b/src/scatter.cu index 49d20e1..0644a52 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -39,7 +39,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index c9eb5bb..8052b81 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -43,7 +43,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; From cd6da4196746d4e838f46162d94ed4fb6eb9d5f4 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Mon, 10 Jul 2023 16:51:34 +0000 Subject: [PATCH 16/19] added some testcases to use as reference --- paramfiles/alltoallv_paramfiles/Rank1Test1.csv | 1 + paramfiles/alltoallv_paramfiles/Rank2Test1.csv | 2 ++ paramfiles/alltoallv_paramfiles/Rank2Test2.csv | 2 ++ paramfiles/alltoallv_paramfiles/Rank3Test1.csv | 3 +++ paramfiles/alltoallv_paramfiles/Rank3Test2.csv | 3 +++ paramfiles/alltoallv_paramfiles/Rank4Test1.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test2.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test3.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test4.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test5.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test6.csv | 5 +++++ 11 files changed, 36 insertions(+) create mode 100644 paramfiles/alltoallv_paramfiles/Rank1Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank2Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank2Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank3Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank3Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test3.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test4.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test5.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test6.csv diff --git a/paramfiles/alltoallv_paramfiles/Rank1Test1.csv b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv @@ -0,0 +1 @@ +1 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test1.csv b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv new file mode 100644 index 0000000..6504ac9 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv @@ -0,0 +1,2 @@ +0.1,0.4 +0,0.5 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test2.csv b/paramfiles/alltoallv_paramfiles/Rank2Test2.csv new 
From d4fb4d05bbfd4c9b7120e3d3949199af50f37a23 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Mon, 10 Jul 2023 20:39:10 +0000
Subject: [PATCH 17/19] Renamed CLI arg

---
 README.md        |  2 +-
 src/alltoallv.cu |  4 ++--
 src/common.cu    | 12 ++++++------
 src/common.h     |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index f5447af..128f0e2 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ All tests support the same set of arguments :
   * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
-  * `-v,--param_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename.
+  * `-s,--setup_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename.
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index d9bf108..1b65537 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -65,7 +65,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
   //parse the param file
   std::vector<std::vector<double>> imbalancingFactors;
-  testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file);
+  testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->setup_file);
   CHECK = 1;
   if(parseSuccess != testSuccess) return parseSuccess;
   for (int i=0; i<args->nGpus; i++) {
@@ -99,7 +99,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   NCCLCHECK(ncclCommUserRank(comm, &myRank));
   std::vector<std::vector<double>> imbalancingFactors;
-  testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file
+  testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->setup_file); //parse the param file
   if(parseSuccess != testSuccess) return parseSuccess;

   size_t rankOffset = count * wordSize(type);
diff --git a/src/common.cu b/src/common.cu
index b7bc46f..f174c0e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -59,7 +59,7 @@ int is_main_proc = 0;
 thread_local int is_main_thread = 0;

 // Command line parameter defaults
-static char param_file[64];
+static char setup_file[64];
 static int nThreads = 1;
 static int nGpus = 1;
 static size_t minBytes = 32*1024*1024;
@@ -685,7 +685,7 @@ int main(int argc, char* argv[]) {
   double parsed;
   int longindex;
   static struct option longopts[] = {
-    {"param_file",optional_argument, 0, 'v'},
+    {"setup_file",optional_argument, 0, 's'},
    {"nthreads", required_argument, 0, 't'},
    {"ngpus", required_argument, 0, 'g'},
    {"minbytes", required_argument, 0, 'b'},
@@ -712,14 +712,14 @@ int main(int argc, char* argv[]) {

   while(1) {
     int c;
-    c = getopt_long(argc, argv, "v:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "s:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);

     if (c == -1)
       break;

     switch(c) {
-      case 'v':
-        strcpy(param_file,optarg);
+      case 's':
+        strcpy(setup_file,optarg);
        break;
       case 't':
        nThreads = strtol(optarg, NULL, 0);
@@ -987,7 +987,7 @@ testResult_t run() {
   memset(threads, 0, sizeof(struct testThread)*nThreads);

   for (int t=nThreads-1; t>=0; t--) {
-    strcpy(threads[t].args.param_file, param_file);
+    strcpy(threads[t].args.setup_file, setup_file);

     threads[t].args.minbytes=minBytes;
     threads[t].args.maxbytes=maxBytes;
diff --git a/src/common.h b/src/common.h
index a559c0a..ac01c4f 100644
--- a/src/common.h
+++ b/src/common.h
@@ -110,7 +110,7 @@ struct testEngine {
 extern struct testEngine ncclTestEngine;

 struct threadArgs {
-  char param_file[64];
+  char setup_file[64];

   size_t nbytes;
   size_t minbytes;
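
With this rename the imbalancing matrix is passed through -s/--setup_file instead of -v/--param_file. An illustrative invocation is shown below; the path and sizes are only an example, the other flags are standard nccl-tests options already present in the getopt string above, and a single-process 4-GPU run is assumed so that the 4x4 matrix matches the rank count:

    ./build/alltoallv_perf -g 4 -b 8 -e 128M -f 2 -s paramfiles/alltoallv_paramfiles/Rank4Test2.csv
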
From 5cbb12e8fb78a5e1d67fc50fcb7609b857d97c94 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Wed, 12 Jul 2023 16:30:10 +0000
Subject: [PATCH 18/19] changed filepath limit

---
 src/alltoallv.cu | 3 +--
 src/common.cu    | 2 +-
 src/common.h     | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 1b65537..05a419a 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -13,7 +13,7 @@ int CHECK = 0;
  * @param imbalancingFactors The reference to the vector that will store the parsed data
  * @param filename The name of the parameter file to parse
 **/
-testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[64]){
+testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[PATH_MAX]){
   std::vector<std::vector<double>> paramFile_data;
   std::ifstream paramFile(filename);

@@ -66,7 +66,6 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   //parse the param file
   std::vector<std::vector<double>> imbalancingFactors;
   testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->setup_file);
-  CHECK = 1;
   if(parseSuccess != testSuccess) return parseSuccess;
   for (int i=0; i<args->nGpus; i++) {
     CUDACHECK(cudaSetDevice(args->gpus[i]));
diff --git a/src/common.cu b/src/common.cu
index f174c0e..559f360 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -59,7 +59,7 @@ int is_main_proc = 0;
 thread_local int is_main_thread = 0;

 // Command line parameter defaults
-static char setup_file[64];
+static char setup_file[PATH_MAX];
 static int nThreads = 1;
 static int nGpus = 1;
 static size_t minBytes = 32*1024*1024;
diff --git a/src/common.h b/src/common.h
index ac01c4f..9bf2769 100644
--- a/src/common.h
+++ b/src/common.h
@@ -110,7 +110,7 @@ struct testEngine {
 extern struct testEngine ncclTestEngine;

 struct threadArgs {
-  char setup_file[64];
+  char setup_file[PATH_MAX];

   size_t nbytes;
   size_t minbytes;
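
A caveat with the PATH_MAX switch: PATH_MAX is provided by <limits.h> (on Linux ultimately by <linux/limits.h>), and the hunks above do not show an include being added, so the build presumably relies on the macro already being visible through existing headers. If it is not, the declarations would need something along these lines:

    #include <limits.h>   // defines PATH_MAX on POSIX systems

    static char setup_file[PATH_MAX];

Note also that the README text for --setup_file still advertises a 64-character limit for the filename, so it may be worth updating alongside this change.
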
From 86577ee9d4c0cf06716796b5e10c4b82ed4a1143 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Wed, 12 Jul 2023 16:57:05 +0000
Subject: [PATCH 19/19] adjusted parse function

---
 src/alltoallv.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 05a419a..6fb265f 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -13,12 +13,12 @@ int CHECK = 0;
  * @param imbalancingFactors The reference to the vector that will store the parsed data
  * @param filename The name of the parameter file to parse
 **/
-testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[PATH_MAX]){
+testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, const char filename[PATH_MAX]){
   std::vector<std::vector<double>> paramFile_data;
   std::ifstream paramFile(filename);

   if (!paramFile.is_open()) {
-    PRINT("\nUNABLE TO OPEN PARAMS FILE\n");
+    PRINT("\nUNABLE TO OPEN PARAMS FILE AT: %s\n", filename);
     return testInternalError;
   }

@@ -45,7 +45,7 @@ testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbala
   }

   if(paramFile_data.size()!=nranks) {
-    PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n");
+    PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS, HAS %zu ROWS, NEEDS %i ROWS\n", paramFile_data.size(), nranks);
     return testInternalError;
   }
   //ensure we have the right amount of rows