Skip to content

Commit

Permalink
Allow more precise measurements of single operation (#20)
Browse files Browse the repository at this point in the history
  • Loading branch information
wenkaidu authored and edgargabriel committed Oct 21, 2022
1 parent 774f098 commit dd957fe
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions src/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ static int average = 1;
static int numDevices = 1;
static int ranksPerGpu = 1;
static int enable_multiranks = 0;
static int delay_inout_place = 0;

#define NUM_BLOCKS 32

Expand Down Expand Up @@ -459,9 +460,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
}

// Sync
TESTCHECK(startColl(args, type, op, root, in_place, 0));
TESTCHECK(completeColl(args));
if (warmup_iters) {
// Sync
TESTCHECK(startColl(args, type, op, root, in_place, 0));
TESTCHECK(completeColl(args));
}

Barrier(args);

Expand Down Expand Up @@ -651,6 +654,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
sprintf(rootName, "%6i", root);
PRINT("%12li %12li %8s %6s %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
TESTCHECK(BenchTime(args, type, op, root, 0));
usleep(delay_inout_place);
TESTCHECK(BenchTime(args, type, op, root, 1));
PRINT("\n");
}
Expand Down Expand Up @@ -802,9 +806,9 @@ int main(int argc, char* argv[]) {
while(1) {
int c;
#ifdef RCCL_MULTIRANKPERGPU
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:q:", longopts, &longindex);
#else
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:q:", longopts, &longindex);
#endif

if (c == -1)
Expand Down Expand Up @@ -913,6 +917,9 @@ int main(int argc, char* argv[]) {
ranksPerGpu = (int)strtol(optarg, NULL, 0);
break;
#endif
case 'q':
delay_inout_place = (int)strtol(optarg, NULL, 10);
break;
case 'h':
default:
if (c != 'h') printf("invalid option '%c'\n", c);
Expand Down Expand Up @@ -950,6 +957,7 @@ int main(int argc, char* argv[]) {
"[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
"[-R,--ranks_per_gpu] \n\t"
#endif
"[-q,--delay <delay between out-of-place and in-place in microseconds>] \n\t"
"[-h,--help]\n",
basename(argv[0]));
return 0;
Expand Down Expand Up @@ -1112,7 +1120,6 @@ testResult_t run() {
}
}
}

//if parallel init is not selected, use main thread to initialize NCCL
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
if (!parallel_init) {
Expand Down

0 comments on commit dd957fe

Please sign in to comment.