From 071d8ac9288298a862d54192f217fe939cfa27e0 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Fri, 2 Jun 2023 23:09:01 +0000 Subject: [PATCH 01/19] Initialized alltoallv test --- src/alltoallv.cu | 111 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/alltoallv.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu new file mode 100644 index 0000000..33a0b28 --- /dev/null +++ b/src/alltoallv.cu @@ -0,0 +1,111 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + + //Can maybe introduce heterogeneity in data size by conditioning sendcnt and recvcont on rank here? + + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoallv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + + for (int r=0; rcollTest = &AlltoAllvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Sat, 3 Jun 2023 00:11:31 +0000 Subject: [PATCH 02/19] Initial draft of alltoallv test. 
Changes: - Added variable count of elements to send/recv based on sending/recieving peers - Added new file to make file Notes: - Current method of uniquely identifying the peers that are sending (thread_local of thread number) may not work correctly. Not sure if that is the appropriate way to determine rank. --- src/Makefile | 2 +- src/alltoallv.cu | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 393de8e..3ef4ebd 100644 --- a/src/Makefile +++ b/src/Makefile @@ -76,7 +76,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall alltoallv scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 33a0b28..26bc67e 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -7,6 +7,8 @@ #include "cuda_runtime.h" #include "common.h" +thread_local int threadNum = -1; + void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; @@ -17,8 +19,6 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - //Can maybe introduce heterogeneity in data size by conditioning sendcnt and recvcont on rank here? - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; @@ -59,10 +59,15 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl #else NCCLCHECK(ncclGroupStart()); + for (int r=0; rcollTest = &AlltoAllvTest; + threadNum = args->thread; ncclDataType_t *run_types; const char **run_typenames; int type_count; - if ((int)type != -1) { type_count = 1; run_types = &type; From 086e318b9eda31926983a9bf3a18ce9646cec0e5 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Sat, 3 Jun 2023 00:39:11 +0000 Subject: [PATCH 03/19] changed to use nccl rank finding function. 
This should now use a valid rank --- src/alltoallv.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 26bc67e..fef27ab 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -7,7 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -thread_local int threadNum = -1; void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; @@ -49,8 +48,9 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl } testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - int nRanks; + int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); + NCCLCHECK(ncclCommUserRank(comm, &myRank)); size_t rankOffset = count * wordSize(type); #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 @@ -61,8 +61,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl for (int r=0; rcollTest = &AlltoAllvTest; - threadNum = args->thread; ncclDataType_t *run_types; const char **run_typenames; int type_count; From 2f0cb03ca6d98429a56d1ad7d7c48cb5b6a172fd Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 20:39:05 +0000 Subject: [PATCH 04/19] created evaluation code for a2av static imbalancing --- src/alltoallv.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index fef27ab..a4ba781 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -9,11 +9,11 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; - *recvcount = (count/nranks)*nranks; + *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. + *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; + *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -24,13 +24,16 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + //j == peer rank + size_t partcount = sendcount/nranks; //create chunk definition to use in offsetting the data initialization + size_t partcount_mod = (partcount - j - rank - 1) % partcount; //imbalance the count of data to initialize same way we do in the test + TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } From e8e2bf57fac38a74d9fd8d5832a8e112a21bc4d9 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 20:51:38 +0000 Subject: [PATCH 05/19] convenience script --- src/runtest.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 src/runtest.sh diff --git a/src/runtest.sh b/src/runtest.sh new file mode 100755 index 0000000..d73f21d --- /dev/null +++ b/src/runtest.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Convenience script to run all tests in the build output + +for script in ../build/*_perf; do + echo $script; + ./$script $@; +done From 5c0ea182acc172c84395fbbd9e847a1dfc88f42b Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 6 Jun 2023 21:10:14 +0000 Subject: [PATCH 06/19] added a default load case --- src/runtest.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/runtest.sh b/src/runtest.sh index d73f21d..b07068a 100755 --- a/src/runtest.sh +++ b/src/runtest.sh @@ -3,5 +3,9 @@ for script in ../build/*_perf; do echo $script; - ./$script $@; + if [ -z "$@"]; then + ./$script -b 8 -e 128M -f 2 -g 2 #convenient default, tests a variety of loads + else + ./$script $@; + fi done From 53bc49625807a5de1f8b111fd9ab48787a8e9312 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Wed, 7 Jun 2023 17:33:07 +0000 Subject: [PATCH 07/19] created second atav testfile for granular test development --- src/alltoallv.cu | 6 --- src/alltoallv2.cu | 111 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 src/alltoallv2.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu index a4ba781..dccbb71 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -1,9 +1,3 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - #include "cuda_runtime.h" #include "common.h" diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu new file mode 100644 index 0000000..dccbb71 --- /dev/null +++ b/src/alltoallv2.cu @@ -0,0 +1,111 @@ +#include "cuda_runtime.h" +#include "common.h" + + +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. + *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. 
+ *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoallv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks, myRank; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + NCCLCHECK(ncclCommUserRank(comm, &myRank)); + size_t rankOffset = count * wordSize(type); + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + + + for (int r=0; rcollTest = &AlltoAllvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Thu, 8 Jun 2023 16:35:22 +0000 Subject: [PATCH 08/19] First draft of more granular alltoallv - alltoallv2.cu testfile: Parameterizes with alltoallv_param.csv - run_a2av.sh script: -- Runs the built test with an arbitrarily named CSV instead of the static name -- Passes through other arguments to the testfile --- run_a2av.sh | 4 +++ src/alltoallv2.cu | 73 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 10 deletions(-) create mode 100755 run_a2av.sh diff --git a/run_a2av.sh b/run_a2av.sh new file mode 100755 index 0000000..1e34789 --- /dev/null +++ b/run_a2av.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +cp $1 ./alltoallv_param.csv +./build/alltoallv2_perf ${@:2} diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index dccbb71..cd2ccef 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -1,7 +1,47 @@ +#include +#include +#include +#include #include "cuda_runtime.h" #include "common.h" +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors){ + //Open param csv + std::vector> paramFile_data; + std::ifstream paramFile("alltoallv_param.csv"); + + if (!paramFile.is_open()) { + PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); + return testInternalError; + exit(-1); + } + + std::string row; + while(std::getline(paramFile,row)){ //iterate over every row + + std::vector values; //values from this line + std::stringstream rowstream(row); + std::string value; + while(std::getline(rowstream,value,',')){ //go over the row and get each value + double dval = std::stod(value); + if(dval<0 || dval>1) { + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE\n"); + return testInternalError; + exit(-1); + } //ensure that the value is between 0 and 1 (inclusive) + + values.push_back(dval); + } + if(values.size()!=nranks) return testInternalError; //ensure that this row has the right amount of values + paramFile_data.push_back(values); + } + + if(paramFile_data.size()!=nranks) return testInternalError; //ensure we have the right amount of rows + + imbalancingFactors = paramFile_data; //store the data in the global variable + return testSuccess; +} void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. 
@@ -11,11 +51,13 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - + //parse the param file + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors); + if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) @@ -23,10 +65,11 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); @@ -48,6 +91,9 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors); //parse the param file + if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 @@ -55,12 +101,19 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl return testNcclError; #else NCCLCHECK(ncclGroupStart()); - - for (int r=0; rimbalancingFactors.size()){ + PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); + return testInternalError; + } else if (r > imbalancingFactors[myRank].size()) { + PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); + return testInternalError; + } + unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; + unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; + NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); } From ef00fa9af63fc0f0b26adc7731ee6179e5dcf8d9 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Fri, 9 Jun 2023 23:19:45 +0000 Subject: [PATCH 09/19] This version passes row == 1 distribution guarantee. Each Rank is guaranteed to send X/nranks data in some distribution. 
--- src/alltoallv2.cu | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index cd2ccef..cda3b61 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -14,40 +14,50 @@ testResult_t parseParamFile(int nranks, std::vector> &imbala if (!paramFile.is_open()) { PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); return testInternalError; - exit(-1); } std::string row; + int rowidx = 0; while(std::getline(paramFile,row)){ //iterate over every row - std::vector values; //values from this line std::stringstream rowstream(row); std::string value; - while(std::getline(rowstream,value,',')){ //go over the row and get each value + double rowsum = 0; + while(std::getline(rowstream,value,',')){ //go over the row and get each value double dval = std::stod(value); if(dval<0 || dval>1) { - PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE\n"); + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); return testInternalError; - exit(-1); - } //ensure that the value is between 0 and 1 (inclusive) - + } //ensure that the value is between 0 and 1 (necessary for probability distribution) + rowsum += dval; values.push_back(dval); } - if(values.size()!=nranks) return testInternalError; //ensure that this row has the right amount of values + if(rowsum!=1.0){ + PRINT("\nINVALID PARAMS FILE, SUM OF ROW %i IS NOT 1\n", rowidx); + return testInternalError; + } //ensure that this row is a valid distribution + if(values.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES\n", rowidx); + return testInternalError; + }//ensure that this row has the right amount of values paramFile_data.push_back(values); + rowidx++; } - if(paramFile_data.size()!=nranks) return testInternalError; //ensure we have the right amount of rows + if(paramFile_data.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); + return testInternalError; + } //ensure we have the right amount of rows imbalancingFactors = paramFile_data; //store the data in the global variable return testSuccess; } void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. - *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. + *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks + *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. + *paramcount = (count/nranks); //each rank in a2av can send up to 1/nranks data. 
} testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -67,7 +77,6 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); From 6b7e790185997eb3b3f0dea422d6ccd9fb789062 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 02:10:04 +0000 Subject: [PATCH 10/19] added CLI arg for param file, switched to using variable in testcase --- src/alltoallv2.cu | 22 +++++++++++----------- src/common.cu | 9 ++++++++- src/common.h | 2 ++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu index cda3b61..b6b9a4b 100644 --- a/src/alltoallv2.cu +++ b/src/alltoallv2.cu @@ -5,11 +5,12 @@ #include "cuda_runtime.h" #include "common.h" +int CHECK = 0; -testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors){ +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ //Open param csv std::vector> paramFile_data; - std::ifstream paramFile("alltoallv_param.csv"); + std::ifstream paramFile(filename); if (!paramFile.is_open()) { PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); @@ -22,22 +23,16 @@ testResult_t parseParamFile(int nranks, std::vector> &imbala std::vector values; //values from this line std::stringstream rowstream(row); std::string value; - double rowsum = 0; while(std::getline(rowstream,value,',')){ //go over the row and get each value double dval = std::stod(value); if(dval<0 || dval>1) { PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); return testInternalError; } //ensure that the value is between 0 and 1 (necessary for probability distribution) - rowsum += dval; values.push_back(dval); } - if(rowsum!=1.0){ - PRINT("\nINVALID PARAMS FILE, SUM OF ROW %i IS NOT 1\n", rowidx); - return testInternalError; - } //ensure that this row is a valid distribution if(values.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES\n", rowidx); + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); return testInternalError; }//ensure that this row has the right amount of values paramFile_data.push_back(values); @@ -66,7 +61,8 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; //parse the param file std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors); + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); + CHECK = 1; if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); @@ -101,10 +97,14 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors); //parse the param file + struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); 
//parse the param file if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); + // Get the base address of the previous stack frame. + // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; diff --git a/src/common.cu b/src/common.cu index 48a629c..f8f37c9 100644 --- a/src/common.cu +++ b/src/common.cu @@ -59,6 +59,7 @@ int is_main_proc = 0; thread_local int is_main_thread = 0; // Command line parameter defaults +static char param_file[64]; static int nThreads = 1; static int nGpus = 1; static size_t minBytes = 32*1024*1024; @@ -685,6 +686,7 @@ int main(int argc, char* argv[]) { double parsed; int longindex; static struct option longopts[] = { + {"param_file",optional_argument, 0, 'v'}, {"nthreads", required_argument, 0, 't'}, {"ngpus", required_argument, 0, 'g'}, {"minbytes", required_argument, 0, 'b'}, @@ -711,12 +713,15 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); + c = getopt_long(argc, argv, "v:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); if (c == -1) break; switch(c) { + case 'v': + strcpy(param_file,optarg); + break; case 't': nThreads = strtol(optarg, NULL, 0); break; @@ -983,6 +988,8 @@ testResult_t run() { memset(threads, 0, sizeof(struct testThread)*nThreads); for (int t=nThreads-1; t>=0; t--) { + strcpy(threads[t].args.param_file, param_file); + threads[t].args.minbytes=minBytes; threads[t].args.maxbytes=maxBytes; threads[t].args.stepbytes=stepBytes; diff --git a/src/common.h b/src/common.h index 20fa461..6d18d48 100644 --- a/src/common.h +++ b/src/common.h @@ -110,6 +110,8 @@ struct testEngine { extern struct testEngine ncclTestEngine; struct threadArgs { + char param_file[64]; + size_t nbytes; size_t minbytes; size_t maxbytes; From 025333acfdc716f7d076dc5b531cc73648773c62 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 17:25:49 +0000 Subject: [PATCH 11/19] added documentation --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 580996b..f5447af 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ All tests support the same set of arguments : * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation + * `-v,--param_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. 
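A usage sketch for the new -v/--param_file option (illustrative only: the CSV file name is made up, the sizes mirror the defaults used in runtest.sh, and the binary path assumes the alltoallv_perf target added to the Makefile earlier in this series, which picks up the param-file handling once it is folded back into alltoallv.cu in the next patch). A single-process run on 4 GPUs gives nranks = 4, so the file must hold a 4x4 matrix of values in [0,1], one row per sending rank:

  0.25,0.25,0.25,0.25
  0.50,0.50,0.50,0.50
  0.75,0.75,0.75,0.75
  1,1,1,1

saved, for example, as params_4ranks.csv and passed on the command line as:

  ./build/alltoallv_perf -v params_4ranks.csv -g 4 -b 8 -e 128M -f 2

These matrix values match the Rank4Test2.csv reference file added later in the series; each entry scales the count/nranks chunk sent from the row's rank to the column's peer.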
From f94647f20987ffe5316cfc4980d6900c9a3149f7 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 19:44:37 +0000 Subject: [PATCH 12/19] rename + docs --- src/alltoallv.cu | 96 +++++++++++++++++++++---- src/alltoallv2.cu | 173 ---------------------------------------------- 2 files changed, 83 insertions(+), 186 deletions(-) delete mode 100644 src/alltoallv2.cu diff --git a/src/alltoallv.cu b/src/alltoallv.cu index dccbb71..1ade93d 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -1,21 +1,74 @@ +#include +#include +#include +#include #include "cuda_runtime.h" #include "common.h" +int CHECK = 0; + +/** + * @brief Parses the parameter file and stores the matrix data into the imbalancingFactors reference passed in. + * @param nranks The number of ranks in the test + * @param imbalancingFactors The reference to the vector that will store the parsed data + * @param filename The name of the parameter file to parse +**/ +testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ + std::vector> paramFile_data; + std::ifstream paramFile(filename); + + if (!paramFile.is_open()) { + PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); + return testInternalError; + } + + std::string row; + int rowidx = 0; + while(std::getline(paramFile,row)){ //iterate over every row + std::vector values; //values from this line + std::stringstream rowstream(row); + std::string value; + while(std::getline(rowstream,value,',')){ //go over the row and get each value + double dval = std::stod(value); + if(dval<0 || dval>1) { + PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); + return testInternalError; + } //ensure that the value is between 0 and 1 (necessary for probability distribution) + values.push_back(dval); + } + if(values.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); + return testInternalError; + }//ensure that this row has the right amount of values + paramFile_data.push_back(values); + rowidx++; + } + if(paramFile_data.size()!=nranks) { + PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); + return testInternalError; + } //ensure we have the right amount of rows + + imbalancingFactors = paramFile_data; //store the data in the return variable + return testSuccess; +} void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //each rank in a2av should be able to send up to count to all of the others combined. - *recvcount = (count/nranks)*nranks; //each rank in a2av should be able to receive up to count from all of its peers. + *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks + *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = count/nranks; //each rank in a2av gets one even chunk to send out. 
+ *paramcount = (count/nranks); } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; - + //parse the param file + std::vector> imbalancingFactors; + testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); + CHECK = 1; + if(parseSuccess != testSuccess) return parseSuccess; for (int i=0; inGpus; i++) { CUDACHECK(cudaSetDevice(args->gpus[i])); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) @@ -23,10 +76,10 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); @@ -48,19 +101,36 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); + std::vector> imbalancingFactors; + + // Since this function is only ever called from the startColl function, this builtin call will return the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct, which contains the param filename. + struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); + testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file + if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); + // Get the base address of the previous stack frame. + // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. + // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; #else NCCLCHECK(ncclGroupStart()); - - for (int r=0; rimbalancingFactors.size()){ + PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); + return testInternalError; + } else if (r > imbalancingFactors[myRank].size()) { + PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); + return testInternalError; + } + unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; + unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; + NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); + NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); } diff --git a/src/alltoallv2.cu b/src/alltoallv2.cu deleted file mode 100644 index b6b9a4b..0000000 --- a/src/alltoallv2.cu +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include -#include -#include "cuda_runtime.h" -#include "common.h" - -int CHECK = 0; - -testResult_t parseParamFile(int nranks, std::vector> &imbalancingFactors, char filename[64]){ - //Open param csv - std::vector> paramFile_data; - std::ifstream paramFile(filename); - - if (!paramFile.is_open()) { - PRINT("\nUNABLE TO OPEN PARAMS FILE\n"); - return testInternalError; - } - - std::string row; - int rowidx = 0; - while(std::getline(paramFile,row)){ //iterate over every row - std::vector values; //values from this line - std::stringstream rowstream(row); - std::string value; - while(std::getline(rowstream,value,',')){ //go over the row and get each value - double dval = std::stod(value); - if(dval<0 || dval>1) { - PRINT("\nINVALID PARAMS FILE, PARAMETER OUT OF 0:1 RANGE, ROW NUMBER: %i \n", rowidx); - return testInternalError; - } //ensure that the value is between 0 and 1 (necessary for probability distribution) - values.push_back(dval); - } - if(values.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, ROW %i DOES NOT HAVE CORRECT NUMBER OF VALUES, HAS %lu ENTRIES, NEEDS %i ENTRIES\n", rowidx, values.size(), nranks); - return testInternalError; - }//ensure that this row has the right amount of values - paramFile_data.push_back(values); - rowidx++; - } - - if(paramFile_data.size()!=nranks) { - PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n"); - return testInternalError; - } //ensure we have the right amount of rows - - imbalancingFactors = paramFile_data; //store the data in the global variable - return testSuccess; -} -void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = (count/nranks)*nranks; //Total send count rounded to a multiple of ranks - *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = (count/nranks); //each rank in a2av can send up to 1/nranks data. 
-} - -testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; - //parse the param file - std::vector> imbalancingFactors; - testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file); - CHECK = 1; - if(parseSuccess != testSuccess) return parseSuccess; - for (int i=0; inGpus; i++) { - CUDACHECK(cudaSetDevice(args->gpus[i])); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); //zeroes out the receive buffer of each GPU with total size (recvcount*wordSize(type)) - CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank - for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); - } - CUDACHECK(cudaDeviceSynchronize()); - } - // We don't support in-place alltoallv - args->reportErrors = in_place ? 0 : 1; - return testSuccess; -} - -void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; - - *algBw = baseBw; - double factor = ((double)(nranks-1))/((double)(nranks)); - *busBw = baseBw * factor; -} - -testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - int nRanks, myRank; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - NCCLCHECK(ncclCommUserRank(comm, &myRank)); - std::vector> imbalancingFactors; - struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); - testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file - if(parseSuccess != testSuccess) return parseSuccess; - size_t rankOffset = count * wordSize(type); - - // Get the base address of the previous stack frame. - // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct. -#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 - printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); - return testNcclError; -#else - NCCLCHECK(ncclGroupStart()); - for (int r=0; rimbalancingFactors.size()){ - PRINT("\nmyRank is greater than imbalancingFactors.size(), %i\n", myRank); - return testInternalError; - } else if (r > imbalancingFactors[myRank].size()) { - PRINT("\nr is greater than imbalancingFactors[myRank].size(), %i\n", r); - return testInternalError; - } - unsigned long send_count_mod = count * imbalancingFactors[myRank][r]; - unsigned long recv_count_mod = count * imbalancingFactors[r][myRank]; - NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, send_count_mod, type, r, comm, stream)); - NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, recv_count_mod, type, r, comm, stream)); - } - - - NCCLCHECK(ncclGroupEnd()); - return testSuccess; -#endif -} - -struct testColl AlltoAllvTest = { - "AlltoAllV", - AlltoAllvGetCollByteCount, - AlltoAllvInitData, - AlltoAllvGetBw, - AlltoAllvRunColl -}; - -void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { - size_t paramcount, sendInplaceOffset, recvInplaceOffset; - AlltoAllvGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); -} - -testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { - args->collTest = &AlltoAllvTest; - ncclDataType_t *run_types; - const char **run_typenames; - int type_count; - if ((int)type != -1) { - type_count = 1; - run_types = &type; - run_typenames = &typeName; - } else { - type_count = test_typenum; - run_types = test_types; - run_typenames = test_typenames; - } - - for (int i=0; i Date: Tue, 27 Jun 2023 20:08:41 +0000 Subject: [PATCH 13/19] removed unnecessary script --- run_a2av.sh | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 run_a2av.sh diff --git a/run_a2av.sh b/run_a2av.sh deleted file mode 100755 index 1e34789..0000000 --- a/run_a2av.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -cp $1 ./alltoallv_param.csv -./build/alltoallv2_perf ${@:2} From 8ad54bd017fc4a560ac292520a11a923624870fc Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Tue, 27 Jun 2023 20:14:54 +0000 Subject: [PATCH 14/19] cleanup --- src/runtest.sh | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100755 src/runtest.sh diff --git a/src/runtest.sh b/src/runtest.sh deleted file mode 100755 index b07068a..0000000 --- a/src/runtest.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -# Convenience script to run all tests in the build output - -for script in ../build/*_perf; do - echo $script; - if [ -z "$@"]; then - ./$script -b 8 -e 128M -f 2 -g 2 #convenient default, tests a variety of loads - else - ./$script $@; - fi -done From 8d6c8d84fe532ba6fe74da4c96e56d93ef5a22ef Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Mon, 10 Jul 2023 16:46:11 +0000 Subject: [PATCH 15/19] Removed builtin usage, replaced with function argument --- src/all_gather.cu | 2 +- src/all_reduce.cu | 2 +- src/alltoall.cu | 2 +- src/alltoallv.cu | 22 ++++++---------------- src/broadcast.cu | 2 +- src/common.cu | 3 +-- src/common.h | 2 +- src/gather.cu | 2 +- src/hypercube.cu | 2 +- src/reduce.cu | 2 +- src/reduce_scatter.cu | 2 +- src/scatter.cu | 2 +- src/sendrecv.cu | 2 +- 13 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index 0831207..9031ed1 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -45,7 +45,7 @@ void AllGatherGetBw(size_t count, 
int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index a38eabe..3bdfb3b 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -40,7 +40,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 41c7c4a..b737dbb 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -45,7 +45,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); size_t rankOffset = count * wordSize(type); diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 1ade93d..d9bf108 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -57,12 +57,11 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par *recvcount = (count/nranks)*nranks; //Total recv count rounded to a multiple of ranks *sendInplaceOffset = 0; *recvInplaceOffset = 0; - *paramcount = (count/nranks); + *paramcount = (count/nranks); //Each rank can send a maximum of count/nranks data to each other rank } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { - size_t sendcount = args->sendBytes / wordSize(type); - size_t recvcount = args->expectedBytes / wordSize(type); + size_t maxchunk = args->nbytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus; //parse the param file std::vector> imbalancingFactors; @@ -75,12 +74,10 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); //copies the zeroed out receive buffer to the expected buffer int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); //current rank void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank + TESTCHECK(InitData(data, maxchunk*nranks, 0, type, ncclSum, 33*rep + rank, 1, 0)); //initializes the sendbuffer data for this rank. 
Should be chunk size * nranks for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount_mod, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); + size_t partcount_mod = maxchunk * imbalancingFactors[j][rank]; //imbalance the count of data to initialize same way we do in the test + TESTCHECK(InitData((char*)args->expected[i] + j*maxchunk*wordSize(type), partcount_mod, rank*maxchunk, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } @@ -97,22 +94,15 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks, myRank; NCCLCHECK(ncclCommCount(comm, &nRanks)); NCCLCHECK(ncclCommUserRank(comm, &myRank)); std::vector> imbalancingFactors; - - // Since this function is only ever called from the startColl function, this builtin call will return the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct, which contains the param filename. - struct threadArgs* args = (struct threadArgs*) (__builtin_frame_address(1)); testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file if(parseSuccess != testSuccess) return parseSuccess; size_t rankOffset = count * wordSize(type); - // Get the base address of the previous stack frame. - // Since this function is only ever called from the startColl function, this will be the address of the startColl function's stack frame. - // The beginning of that stack frame will be the threadargs struct. #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; diff --git a/src/broadcast.cu b/src/broadcast.cu index 903066a..9914a57 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -39,7 +39,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index f8f37c9..b7bc46f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -374,11 +374,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); } #endif - TESTCHECK(args->collTest->runColl( (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), (void*)(in_place ? 
recvBuff + args->recvInplaceOffset*rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); + count, type, op, root, args->comms[i], args->streams[i], args)); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) if(opIndex >= ncclNumOps) { diff --git a/src/common.h b/src/common.h index 6d18d48..a559c0a 100644 --- a/src/common.h +++ b/src/common.h @@ -92,7 +92,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; diff --git a/src/gather.cu b/src/gather.cu index 03ef4d9..9f3c5ad 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -43,7 +43,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/hypercube.cu b/src/hypercube.cu index 5c1456f..8cb73ed 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -45,7 +45,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; diff --git a/src/reduce.cu b/src/reduce.cu index f2fa80d..6fed902 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -39,7 +39,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index ed372e3..f23cbc2 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -44,7 +44,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { NCCLCHECK(ncclReduceScatter(sendbuff, 
recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/scatter.cu b/src/scatter.cu index 49d20e1..0644a52 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -39,7 +39,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index c9eb5bb..8052b81 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -43,7 +43,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, struct threadArgs* args) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; From cd6da4196746d4e838f46162d94ed4fb6eb9d5f4 Mon Sep 17 00:00:00 2001 From: Sidharth Babu Date: Mon, 10 Jul 2023 16:51:34 +0000 Subject: [PATCH 16/19] added some testcases to use as reference --- paramfiles/alltoallv_paramfiles/Rank1Test1.csv | 1 + paramfiles/alltoallv_paramfiles/Rank2Test1.csv | 2 ++ paramfiles/alltoallv_paramfiles/Rank2Test2.csv | 2 ++ paramfiles/alltoallv_paramfiles/Rank3Test1.csv | 3 +++ paramfiles/alltoallv_paramfiles/Rank3Test2.csv | 3 +++ paramfiles/alltoallv_paramfiles/Rank4Test1.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test2.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test3.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test4.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test5.csv | 4 ++++ paramfiles/alltoallv_paramfiles/Rank4Test6.csv | 5 +++++ 11 files changed, 36 insertions(+) create mode 100644 paramfiles/alltoallv_paramfiles/Rank1Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank2Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank2Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank3Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank3Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test1.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test2.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test3.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test4.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test5.csv create mode 100644 paramfiles/alltoallv_paramfiles/Rank4Test6.csv diff --git a/paramfiles/alltoallv_paramfiles/Rank1Test1.csv b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank1Test1.csv @@ -0,0 +1 @@ +1 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test1.csv b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv new file mode 100644 index 0000000..6504ac9 --- /dev/null +++ b/paramfiles/alltoallv_paramfiles/Rank2Test1.csv @@ -0,0 +1,2 @@ +0.1,0.4 +0,0.5 diff --git a/paramfiles/alltoallv_paramfiles/Rank2Test2.csv b/paramfiles/alltoallv_paramfiles/Rank2Test2.csv new 
From d4fb4d05bbfd4c9b7120e3d3949199af50f37a23 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Mon, 10 Jul 2023 20:39:10 +0000
Subject: [PATCH 17/19] Renamed CLI arg

---
 README.md        |  2 +-
 src/alltoallv.cu |  4 ++--
 src/common.cu    | 12 ++++++------
 src/common.h     |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index f5447af..128f0e2 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ All tests support the same set of arguments :
   * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1.
   * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
 * Test operation
-  * `-v,--param_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename.
+  * `-s,--setup_file ` Read parameters from file for tests that require it. Currently only required for alltoallv benchmark. Default : disabled. Max of 64 characters for filename.
   * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0.
   * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
   * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index d9bf108..1b65537 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -65,7 +65,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
   //parse the param file
   std::vector<std::vector<double>> imbalancingFactors;
-  testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->param_file);
+  testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->setup_file);
   CHECK = 1;
   if(parseSuccess != testSuccess) return parseSuccess;
   for (int i=0; i<args->nGpus; i++) {
@@ -99,7 +99,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   NCCLCHECK(ncclCommUserRank(comm, &myRank));
   std::vector<std::vector<double>> imbalancingFactors;
-  testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->param_file); //parse the param file
+  testResult_t parseSuccess = parseParamFile(nRanks, imbalancingFactors, args->setup_file); //parse the param file
   if(parseSuccess != testSuccess) return parseSuccess;

   size_t rankOffset = count * wordSize(type);
diff --git a/src/common.cu b/src/common.cu
index b7bc46f..f174c0e 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -59,7 +59,7 @@ int is_main_proc = 0;
 thread_local int is_main_thread = 0;

 // Command line parameter defaults
-static char param_file[64];
+static char setup_file[64];
 static int nThreads = 1;
 static int nGpus = 1;
 static size_t minBytes = 32*1024*1024;
@@ -685,7 +685,7 @@ int main(int argc, char* argv[]) {
   double parsed;
   int longindex;
   static struct option longopts[] = {
-    {"param_file",optional_argument, 0, 'v'},
+    {"setup_file",optional_argument, 0, 's'},
    {"nthreads", required_argument, 0, 't'},
    {"ngpus", required_argument, 0, 'g'},
    {"minbytes", required_argument, 0, 'b'},
@@ -712,14 +712,14 @@ int main(int argc, char* argv[]) {

   while(1) {
     int c;
-    c = getopt_long(argc, argv, "v:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);
+    c = getopt_long(argc, argv, "s:t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex);

     if (c == -1)
       break;

     switch(c) {
-      case 'v':
-        strcpy(param_file,optarg);
+      case 's':
+        strcpy(setup_file,optarg);
        break;
       case 't':
        nThreads = strtol(optarg, NULL, 0);
@@ -987,7 +987,7 @@ testResult_t run() {
   memset(threads, 0, sizeof(struct testThread)*nThreads);

   for (int t=nThreads-1; t>=0; t--) {
-    strcpy(threads[t].args.param_file, param_file);
+    strcpy(threads[t].args.setup_file, setup_file);

     threads[t].args.minbytes=minBytes;
     threads[t].args.maxbytes=maxBytes;
diff --git a/src/common.h b/src/common.h
index a559c0a..ac01c4f 100644
--- a/src/common.h
+++ b/src/common.h
@@ -110,7 +110,7 @@ struct testEngine {
 extern struct testEngine ncclTestEngine;

 struct threadArgs {
-  char param_file[64];
+  char setup_file[64];

   size_t nbytes;
   size_t minbytes;
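
With this rename the imbalancing matrix is passed through -s/--setup_file instead of -v/--param_file. An illustrative invocation is shown below; the path and sizes are only an example, the other flags are standard nccl-tests options already present in the getopt string above, and a single-process 4-GPU run is assumed so that the 4x4 matrix matches the rank count:

    ./build/alltoallv_perf -g 4 -b 8 -e 128M -f 2 -s paramfiles/alltoallv_paramfiles/Rank4Test2.csv
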
From 5cbb12e8fb78a5e1d67fc50fcb7609b857d97c94 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Wed, 12 Jul 2023 16:30:10 +0000
Subject: [PATCH 18/19] changed filepath limit

---
 src/alltoallv.cu | 3 +--
 src/common.cu    | 2 +-
 src/common.h     | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 1b65537..05a419a 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -13,7 +13,7 @@ int CHECK = 0;
  * @param imbalancingFactors The reference to the vector that will store the parsed data
  * @param filename The name of the parameter file to parse
 **/
-testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[64]){
+testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[PATH_MAX]){
   std::vector<std::vector<double>> paramFile_data;
   std::ifstream paramFile(filename);

@@ -66,7 +66,6 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   //parse the param file
   std::vector<std::vector<double>> imbalancingFactors;
   testResult_t parseSuccess = parseParamFile(nranks, imbalancingFactors, args->setup_file);
-  CHECK = 1;
   if(parseSuccess != testSuccess) return parseSuccess;
   for (int i=0; i<args->nGpus; i++) {
     CUDACHECK(cudaSetDevice(args->gpus[i]));
diff --git a/src/common.cu b/src/common.cu
index f174c0e..559f360 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -59,7 +59,7 @@ int is_main_proc = 0;
 thread_local int is_main_thread = 0;

 // Command line parameter defaults
-static char setup_file[64];
+static char setup_file[PATH_MAX];
 static int nThreads = 1;
 static int nGpus = 1;
 static size_t minBytes = 32*1024*1024;
diff --git a/src/common.h b/src/common.h
index ac01c4f..9bf2769 100644
--- a/src/common.h
+++ b/src/common.h
@@ -110,7 +110,7 @@ struct testEngine {
 extern struct testEngine ncclTestEngine;

 struct threadArgs {
-  char setup_file[64];
+  char setup_file[PATH_MAX];

   size_t nbytes;
   size_t minbytes;
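
A caveat with the PATH_MAX switch: PATH_MAX is provided by <limits.h> (on Linux ultimately by <linux/limits.h>), and the hunks above do not show an include being added, so the build presumably relies on the macro already being visible through existing headers. If it is not, the declarations would need something along these lines:

    #include <limits.h>   // defines PATH_MAX on POSIX systems

    static char setup_file[PATH_MAX];

Note also that the README text for --setup_file still advertises a 64-character limit for the filename, so it may be worth updating alongside this change.
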
From 86577ee9d4c0cf06716796b5e10c4b82ed4a1143 Mon Sep 17 00:00:00 2001
From: Sidharth Babu
Date: Wed, 12 Jul 2023 16:57:05 +0000
Subject: [PATCH 19/19] adjusted parse function

---
 src/alltoallv.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 05a419a..6fb265f 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -13,12 +13,12 @@ int CHECK = 0;
  * @param imbalancingFactors The reference to the vector that will store the parsed data
  * @param filename The name of the parameter file to parse
 **/
-testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, char filename[PATH_MAX]){
+testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbalancingFactors, const char filename[PATH_MAX]){
   std::vector<std::vector<double>> paramFile_data;
   std::ifstream paramFile(filename);

   if (!paramFile.is_open()) {
-    PRINT("\nUNABLE TO OPEN PARAMS FILE\n");
+    PRINT("\nUNABLE TO OPEN PARAMS FILE AT: %s\n", filename);
     return testInternalError;
   }

@@ -45,7 +45,7 @@ testResult_t parseParamFile(int nranks, std::vector<std::vector<double>> &imbala
   }

   if(paramFile_data.size()!=nranks) {
-    PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS\n");
+    PRINT("\nINVALID PARAMS FILE, DOES NOT HAVE CORRECT NUMBER OF ROWS, HAS %zu ROWS, NEEDS %i ROWS\n", paramFile_data.size(), nranks);
     return testInternalError;
   }
   //ensure we have the right amount of rows