From 9696b8fa30bdd59348c97bfd739116d5c3099a9f Mon Sep 17 00:00:00 2001
From: Christopher Pirillo
Date: Fri, 15 Dec 2023 10:30:19 -0800
Subject: [PATCH] Add a nccl-test sample workload (#345)

---
 sample_workloads/nccltest/README.md           | 150 +++++++++++++++
 sample_workloads/nccltest/docker/Dockerfile   |  73 ++++++++
 .../docker/root_scripts/gen_hostfiles.sh      |  24 +++
 .../nccltest/docker/root_scripts/init_ssh.sh  |  13 ++
 .../nccltest/docker/root_scripts/tune_net.sh  |  66 +++++++
 .../docker/scripts/container_entry.sh         | 151 +++++++++++++++
 .../nccltest/docker/scripts/mpi_entry.sh      |  45 +++++
 .../docker/scripts/run_nccl_benchmark.sh      |  74 ++++++++
 sample_workloads/nccltest/gke/Chart.yaml      |   6 +
 .../gke/templates/nccl_benchmarks.yaml        | 176 ++++++++++++++++++
 sample_workloads/nccltest/gke/values.yaml     |  59 ++++++
 11 files changed, 837 insertions(+)
 create mode 100644 sample_workloads/nccltest/README.md
 create mode 100644 sample_workloads/nccltest/docker/Dockerfile
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/tune_net.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/container_entry.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/mpi_entry.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh
 create mode 100644 sample_workloads/nccltest/gke/Chart.yaml
 create mode 100644 sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml
 create mode 100644 sample_workloads/nccltest/gke/values.yaml

diff --git a/sample_workloads/nccltest/README.md b/sample_workloads/nccltest/README.md
new file mode 100644
index 00000000..1096affe
--- /dev/null
+++ b/sample_workloads/nccltest/README.md
@@ -0,0 +1,150 @@
+# TCPX NCCL Level Benchmarks
+
+This document walks you through building, pushing, and deploying the nccl-benchmark image for use on a GKE cluster configured with an A3 node pool.
+
+## Building the Benchmark Docker Image
+
+To build the latest benchmark Docker image, run:
+
+```shell
+cd docker && docker build . -t nccl-benchmarks
+```
+
+**Note: a pre-built image is already referenced in `values.yaml`.**
+
+## Running the TCPX NCCL Benchmarks
+
+This section describes how you can run a 2-node, world-level all-reduce
+benchmark sweeping message sizes from 1G to 8G.
+
+If you intend to run on GKE, run:
+
+```shell
+cd gke
+PARAMS="cluster.nNodes=2,"
+PARAMS+="ncclBenchmarks.benchmarks=all_reduce_perf,"
+PARAMS+="ncclBenchmarks.masks=0x0,"
+PARAMS+="ncclBenchmarks.msgSizeBegin=1G,"
+PARAMS+="ncclBenchmarks.msgSizeEnd=8G"
+helm install "${USER}-nccl-bm" . --set "$PARAMS"
+```
+
+Once the job is scheduled, find your master pod by running:
+
+```shell
+kubectl get pods | grep "${USER}-nccl-bm.*pod0"
+```
+
+You can then follow the logs with (substituting the pod name found above):
+
+```shell
+kubectl logs --follow <master-pod-name> -c nccl-benchmarks
+```
+
+### Finding Results
+
+The container logs the output of NVIDIA's nccl-tests binaries for each of the tests requested in the parameters.
+
+Each test run is logged separately. If you specify a release that uses multiple tests and multiple runs, the logs will be ordered Test0Run0...Test0RunN...TestNRunN.
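+
+If `gcsBucket` is set, you can also pull a whole job's results down locally once it finishes. A minimal sketch using `gsutil`, assuming the job directory naming from `container_entry.sh` (`<timestamp>_<jobname>_nnodes_<N>_gpus_<G>`; the bucket and job names below are placeholders):
+
+```shell
+# List finished jobs in the results bucket.
+gsutil ls gs://my-results-bucket/
+
+# Download all per-run logs and CSVs for one job.
+gsutil -m cp -r gs://my-results-bucket/2023-12-15-103019_my-nccl-bm_nnodes_2_gpus_8 ./results/
+```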
+
+An example test output would look like:
+```
+benchmark: all_reduce_perf, mask: 0x0, run 1/1
+# nThread 1 nGpus 1 minBytes 1048576 maxBytes 8589934592 step: 2(factor) warmup iters: 2 iters: 10 agg iters: 1 validation: 0 graph: 0
+#
+#                                                              out-of-place                       in-place
+#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+     1048576        262144     float     sum      -1   1788.7    0.59    1.10    N/A   1725.9    0.61    1.14    N/A
+     2097152        524288     float     sum      -1   1645.1    1.27    2.39    N/A   1649.8    1.27    2.38    N/A
+     4194304       1048576     float     sum      -1   1701.3    2.47    4.62    N/A   1691.6    2.48    4.65    N/A
+     8388608       2097152     float     sum      -1   1891.3    4.44    8.32    N/A   1911.9    4.39    8.23    N/A
+    16777216       4194304     float     sum      -1   1932.8    8.68   16.28    N/A   2014.4    8.33   15.62    N/A
+    33554432       8388608     float     sum      -1   2190.8   15.32   28.72    N/A   2395.8   14.01   26.26    N/A
+    67108864      16777216     float     sum      -1   2367.6   28.34   53.15    N/A   2389.8   28.08   52.65    N/A
+   134217728      33554432     float     sum      -1   3539.5   37.92   71.10    N/A   3266.7   41.09   77.04    N/A
+   268435456      67108864     float     sum      -1   5969.3   44.97   84.32    N/A   5850.8   45.88   86.03    N/A
+   536870912     134217728     float     sum      -1    11625   46.18   86.59    N/A    11737   45.74   85.77    N/A
+  1073741824     268435456     float     sum      -1    23144   46.39   86.99    N/A    38777   27.69   51.92    N/A
+  2147483648     536870912     float     sum      -1    45662   47.03   88.18    N/A    45522   47.17   88.45    N/A
+  4294967296    1073741824     float     sum      -1    90227   47.60   89.25    N/A    90354   47.53   89.13    N/A
+  8589934592    2147483648     float     sum      -1   179880   47.75   89.54    N/A   178867   48.02   90.05    N/A
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 49.6371
+```
+
+If `gcsBucket` is specified in the values.yaml file, then the logs will also be uploaded to the specified bucket.
+
+### Breaking Down the Parameters
+
+*Disclaimer: This is not a comprehensive list. Refer to `values.yaml` for a full list of tunable parameters.*
+
+#### Benchmark Parameters
+
+|yaml path|Explanation|
+|---|---|
+|`ncclBenchmarks.benchmarks`|A CSV of benchmarks to run.|
+|`ncclBenchmarks.masks`|A CSV of hexadecimal masks to use.|
+|`ncclBenchmarks.msgSizeBegin`|The minimum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes.|
+|`ncclBenchmarks.msgSizeEnd`|The maximum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/master/src/common.cu#L86).|
+|`ncclBenchmarks.nGpusPerNode`|Number of GPUs per node to use.|
+|`ncclBenchmarks.warmupIters`|Number of warmup iterations.|
+|`ncclBenchmarks.runIters`|Number of iterations per run.|
+|`ncclBenchmarks.nRuns`|Number of runs to aggregate over.|
+
+##### Benchmarks, masks, and message sizes
+
+You can specify multiple benchmarks, each with its own hexadecimal mask. **The
+message sizes to sweep over are shared across all benchmarks.** Supported
+benchmarks are `all_gather_perf`, `all_reduce_perf`, `reduce_scatter_perf`, `broadcast_perf`,
+`reduce_perf`, `sendrecv_perf`, `scatter_perf`, `gather_perf`, `alltoall_perf`, and `hypercube_perf`.
+
+For each benchmark, you must supply a mask. The benchmark does a bitwise AND
+between the rank and the mask to get a color, and ranks with the same color
+go in the same NCCL communicator (see the sketch after the note below). Examples:
+
+- For a world-level NCCL operation, `MASK=0x0`.
+- For a rail-aligned NCCL operation using all 8 GPUs on a VM, `MASK=0x7`.
+- For a rail-aligned NCCL operation using only 4 GPUs on a VM, `MASK=0x3`.
+
+*Note: Providing a mask larger than the number of GPUs on a VM will result in asymmetric network traffic between VMs.*
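+
+As a concrete illustration of the rank-to-color mapping (plain shell arithmetic; the two-node, 16-rank setup is just an example):
+
+```shell
+# With MASK=0x7 on 2 nodes x 8 GPUs, ranks that share (rank & mask) land in
+# the same communicator: rank 0 pairs with rank 8, rank 1 with rank 9, etc.
+for rank in $(seq 0 15); do
+  echo "rank ${rank} -> color $(( rank & 0x7 ))"
+done
+```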
+
+Message sizes should be specified using 'G', 'M', 'K', or no suffix for bytes.
+For example, 1G == 1024M == (1024 * 1024)K == (1024 * 1024 * 1024) bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/1292b25553bd0384f2faa2965f9d82b99797a348/src/common.cu#L86C1-L120C2).
+
+##### WARMUP_ITERS and RUN_ITERS
+
+For each message size, the benchmark measures the average latency and bus
+bandwidth used. Each run consists of `warmupIters` warmup iterations, followed
+by `runIters` measured iterations used to derive performance.
+
+#### Switching Out Software Components
+
+|yaml path|Explanation|
+|---|---|
+|`rxdm.image`|Image for the TCPX RxDM.|
+|`rxdm.tag`|Tag for the TCPX RxDM.|
+|`rxdm.flags`|Runtime flags for the TCPX RxDM.|
+|`ncclPlugin.image`|Image for the TCPX NCCL plugin.|
+|`ncclPlugin.tag`|Tag for the TCPX NCCL plugin.|
+|`ncclPlugin.unreservedCores`|CPU cores left unreserved by TCPX, used for the application's `taskset` call.|
+|`ncclPlugin.envs`|Environment variables for the TCPX NCCL plugin.|
+
+**For TCPX NCCL, any environment variables starting with `NCCL` will be picked
+up by the benchmarking container.**
+
+#### More Fine-Grained Node Placement Control
+
+|yaml path|Explanation|
+|---|---|
+|`cluster.sbPlacement`|If deliberate superblock placement should be enabled.|
+|`cluster.nSuperblocks`|Number of superblocks for the job to span.|
+|`cluster.startSuperblock`|Which superblock to start the job on.|
+
+In GKE, we have a flag to toggle deliberate superblock placement. If enabled,
+we will try to split the job among `cluster.nSuperblocks` superblocks, starting
+from superblock `cluster.startSuperblock`. **This guarantees closer affinity
+between the job nodes and should be enabled for performance benchmarking.**
+
+*Note that this feature relies on a `superblock` node label in the Kubernetes
+cluster and will not work if that label is missing. For example, nodes in
+superblock 1 should carry the label `superblock: 1`.*
\ No newline at end of file
diff --git a/sample_workloads/nccltest/docker/Dockerfile b/sample_workloads/nccltest/docker/Dockerfile
new file mode 100644
index 00000000..382d9cdb
--- /dev/null
+++ b/sample_workloads/nccltest/docker/Dockerfile
@@ -0,0 +1,73 @@
+FROM nvidia/cuda:12.0.0-devel-ubuntu20.04
+
+ENV DEBIAN_FRONTEND='noninteractive'
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git openssh-server wget iproute2 vim libopenmpi-dev build-essential \
+        cmake gdb python3 \
+        protobuf-compiler libprotobuf-dev rsync libssl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
+ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
+
+WORKDIR /third_party
+# Install patched NCCL
+RUN git clone https://github.com/NVIDIA/nccl.git nccl-netsupport && \
+    cd nccl-netsupport && \
+    git fetch --all --tags && \
+    git checkout -b github_nccl_2_18_5 05121c8191984aada7ed57dd8081bd987f73288f
+WORKDIR nccl-netsupport
+RUN make NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+WORKDIR /third_party
+RUN git clone https://github.com/NVIDIA/nccl-tests.git
+WORKDIR nccl-tests
+RUN git fetch --all --tags
+RUN make CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+WORKDIR /third_party
+RUN git clone https://github.com/NVIDIA/nccl-tests.git nccl-tests-mpi
+WORKDIR nccl-tests-mpi
+RUN git fetch --all --tags
+RUN make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+# copy all license files
+WORKDIR /third_party/licenses
+RUN cp ../nccl-netsupport/LICENSE.txt license_nccl.txt
+RUN cp ../nccl-tests/LICENSE.txt license_nccl_tests.txt
+
+# Setup SSH to use port 222
+RUN cd /etc/ssh/ && sed --in-place='.bak' 's/#Port 22/Port 222/' sshd_config && \
+    sed --in-place='.bak' 's/#PermitRootLogin prohibit-password/PermitRootLogin prohibit-password/' sshd_config
+RUN ssh-keygen -t rsa -b 4096 -q -f /root/.ssh/id_rsa -N ""
+RUN touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
+RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
+
+# remove NCCL from container
+WORKDIR /third_party/nccl-netsupport
+RUN rm -f build/lib/libnccl*
+WORKDIR /usr/lib/x86_64-linux-gnu/
+RUN rm -f libnccl*
+
+# Install gcsfuse and Nsight Systems.
+RUN apt-get update \
+    && apt-get install --yes \
+        curl lsb-release cuda-nsight-systems-12-0 \
+    && echo "deb https://packages.cloud.google.com/apt gcsfuse-$(lsb_release -c -s) main" \
+        | tee /etc/apt/sources.list.d/gcsfuse.list \
+    && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
+    && apt-get update \
+    && apt-get install -y gcsfuse \
+    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && mkdir /gcs
+
+ADD root_scripts /scripts
+RUN chmod +rx /scripts/gen_hostfiles.sh /scripts/init_ssh.sh /scripts/tune_net.sh
+
+ADD scripts /workspace
+RUN chmod +rx /workspace/container_entry.sh /workspace/mpi_entry.sh /workspace/run_nccl_benchmark.sh
+
+WORKDIR /workspace
+ENTRYPOINT ["/bin/bash", "/workspace/container_entry.sh"]
diff --git a/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh b/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
new file mode 100644
index 00000000..3746c4fa
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+len() {
+  local -r arr=($@)
+  echo "${#arr[@]}"
+}
+
+NRANKS_FACTORS=(1 2 4 8)
+
+NHOSTS=$(len "$@")
+echo "generating hostfiles for ${NHOSTS} hosts: "
+for h in "$@"; do echo "$h"; done
+
+mkdir -p "hostfiles${NHOSTS}"
+
+for nr in "${NRANKS_FACTORS[@]}";
+do
+  rm -f "hostfiles${NHOSTS}/hostfile${nr}"
+  touch "hostfiles${NHOSTS}/hostfile${nr}"
+  for h in "$@";
+  do
+    echo "$h port=222 slots=${nr}" >> "hostfiles${NHOSTS}/hostfile${nr}"
+  done
+done
diff --git a/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh b/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
new file mode 100644
index 00000000..a9d01cd7
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+PORT=${PORT:-222}
+
+while true; do
+  host=$1
+  if [[ -z $host ]]; then
+    break
+  fi
+  ssh -o StrictHostKeyChecking=no -p "${PORT}" "$host" \
+    echo "Hello from ${host}"
+  shift
+done
diff --git a/sample_workloads/nccltest/docker/root_scripts/tune_net.sh b/sample_workloads/nccltest/docker/root_scripts/tune_net.sh
new file mode 100644
index 00000000..12abbb99
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/tune_net.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+set -e
+
+# Dumps the irq binding of $ifname for sanity checking.
+dump_irq_binding() {
+  local -r ifname="$1"
+  echo -e "\n\ndump_irq_binding: ifname=${ifname}\n"
+  for irq in $(ls "/sys/class/net/${ifname}/device/msi_irqs/"); do
+    smp_affinity_list=$(cat "/proc/irq/${irq}/smp_affinity_list")
+    echo irq="$irq" smp_affinity_list="$smp_affinity_list"
+  done
+}
+
+set_irq_range() {
+  local -r nic="$1"
+  local core_start="$2"
+  local num_cores="$3"
+
+  # The user may not have this $nic configured on their VM; if not, just skip
+  # it, no need to error out.
+  if [[ ! -d "/sys/class/net/${nic}/device" ]]; then
+    return;
+  fi
+
+  echo "Setting irq binding for ${nic}..."
+
+  # We count the number of rx queues and assume number of rx queues == tx
+  # queues. Currently the GVE configuration at boot is 16 rx + 16 tx.
+  num_q=$(ls -1 "/sys/class/net/${nic}/queues/" | grep rx | wc -l)
+
+  irq_start=$(ls -1 "/sys/class/net/${nic}/device/msi_irqs" | sort -n | head -n 1)
+  idx=0
+  for ((queue = 0; queue < "$num_q"; queue++)); do
+    irq=$((irq_start + "$queue"))
+
+    core=$(( core_start + idx ))
+
+    # this is GVE's TX irq. See gve_tx_idx_to_ntfy().
+    echo "$core" > /proc/irq/"$irq"/smp_affinity_list
+
+    # this is GVE's RX irq. See gve_rx_idx_to_ntfy().
+    echo "$core" > /proc/irq/$(("$irq" + "$num_q"))/smp_affinity_list
+
+    idx=$(( (idx + 1) % num_cores ))
+  done
+}
+
+# The eth0-eth4 bindings below assume the NIC naming used by COS on GKE.
+# If this is being run on a system where the NICs have different names,
+# change these to the correct names.
+
+a3_bind_irqs() {
+  set_irq_range eth0 32 4
+  set_irq_range eth1 36 8
+  set_irq_range eth2 44 8
+  set_irq_range eth3 88 8
+  set_irq_range eth4 96 8
+}
+
+a3_bind_irqs
+
+dump_irq_binding eth0
+dump_irq_binding eth1
+dump_irq_binding eth2
+dump_irq_binding eth3
+dump_irq_binding eth4
diff --git a/sample_workloads/nccltest/docker/scripts/container_entry.sh b/sample_workloads/nccltest/docker/scripts/container_entry.sh
new file mode 100644
index 00000000..af968941
--- /dev/null
+++ b/sample_workloads/nccltest/docker/scripts/container_entry.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+# Container entry script. Runs NCCL-level benchmarks and reports results.
+
+set -u
+
+# Job parameters.
+: "${JOB_TIMESTAMP:?Must set JOB_TIMESTAMP}"
+: "${JOB_NAME:?Must set JOB_NAME}"
+: "${MASTER_ADDR:?Must set MASTER_ADDR}"
+: "${NNODES:?Must set NNODES}"
+: "${NODE_RANK:?Must set NODE_RANK}"
+
+# Benchmark parameters.
+: "${BENCHMARKS_CSV:?Must set BENCHMARKS_CSV}"
+: "${MASKS_CSV:?Must set MASKS_CSV}"
+: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}"
+: "${MSG_SIZE_END:?Must set MSG_SIZE_END}"
+: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}"
+: "${WARMUP_ITERS:?Must set WARMUP_ITERS}"
+: "${RUN_ITERS:?Must set RUN_ITERS}"
+: "${N_RUNS:?Must set N_RUNS}"
+
+# Unreserved cores for taskset call. This is a CSV of ranges for cores unused
+# by TCPX.
+: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}"
+
+# Telemetry.
+: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}"
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# If input is malformed, error straight away.
+IFS=',' read -r -a BENCHMARKS <<< "$BENCHMARKS_CSV"
+IFS=',' read -r -a MASKS <<< "$MASKS_CSV"
+
+if [[ "${#BENCHMARKS[@]}" -ne "${#MASKS[@]}" ]]; then
+  echo "Mismatching length of BENCHMARKS_CSV and MASKS_CSV, aborting..."
+  exit 1
+fi
+
+for BENCHMARK in ${BENCHMARKS[@]}; do
+  if [[ "$BENCHMARK" != "all_gather_perf" && "$BENCHMARK" != "all_reduce_perf" && \
+        "$BENCHMARK" != "reduce_scatter_perf" && "$BENCHMARK" != "broadcast_perf" && \
+        "$BENCHMARK" != "reduce_perf" && "$BENCHMARK" != "sendrecv_perf" && \
+        "$BENCHMARK" != "scatter_perf" && "$BENCHMARK" != "gather_perf" && \
+        "$BENCHMARK" != "alltoall_perf" && "$BENCHMARK" != "hypercube_perf" ]]; then
+    echo "${BENCHMARK} is not a legal benchmark, aborting..."
+    exit 2
+  fi
+done
+
+# Create shared directory for GCE compatibility.
+mkdir -p /usr/share/nccl_benchmarks
+
+# Wait for master node to be ready.
+MASTER_READY=""
+while [[ -z "$MASTER_READY" ]]; do
+  echo "Waiting for master..."
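+  # getent succeeds once the headless Service DNS record for the master pod
+  # resolves (the Service is created with clusterIP: None by the Helm chart).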
+ MASTER_READY=$( getent hosts ${MASTER_ADDR} ) + echo $MASTER_READY + sleep 5 +done + +# IRQ tunings. +/scripts/tune_net.sh + +# Start SSH service on hosts and send address of self to master node. +mkdir -p /run/mpi_bootstrap +service ssh start + +RANK_FILENAME="rank${NODE_RANK}.txt" +hostname > "$RANK_FILENAME" +while true; do + scp -o StrictHostKeyChecking=no -P 222 \ + "$RANK_FILENAME" "${MASTER_ADDR}:/run/mpi_bootstrap/" + + EXIT_STATUS=$? + if [[ "$EXIT_STATUS" -eq 0 ]]; then + break + fi + sleep 1 +done + +# Mount GCS. +echo "Mounting GCS..." +GCS_ROOT_DIR=/workspace/logs +JOB_LOG_DIR_NAME="${JOB_TIMESTAMP}_${JOB_NAME}_nnodes_${NNODES}_gpus_${GPUS_PER_NODE}" +JOB_LOG_DIR="${GCS_ROOT_DIR}/${JOB_LOG_DIR_NAME}" + +mkdir -p "$GCS_ROOT_DIR" +if [[ ! -z "${GCS_BUCKET}" ]]; then + gcsfuse --implicit-dirs "$GCS_BUCKET" "$GCS_ROOT_DIR" + echo "GCS mount complete; results at ${GCS_BUCKET}/${JOB_LOG_DIR_NAME}" +else + echo "GCS Bucket not specified, no logs will be uploaded. Local logs can be found at ${GCS_ROOT_DIR}/${JOB_LOG_DIR_NAME}" +fi + +if [[ "$NODE_RANK" -eq 0 ]]; then + # Once host information has arrived, initialize SSH, generate hostfile, and + # start the tests. + echo "Waiting for host information to arrive..." + NRANKS_READY=0 + while [[ "$NRANKS_READY" -lt "$NNODES" ]]; do + NRANKS_READY=$( ls /run/mpi_bootstrap/rank*.txt | wc -l ) + sleep 1 + done + + for (( i = 0; i < NNODES; ++i )); do + cat "/run/mpi_bootstrap/rank${i}.txt" >> /run/mpi_bootstrap/hosts.txt + done + + cat /run/mpi_bootstrap/hosts.txt | xargs /scripts/init_ssh.sh + pushd /scripts + cat /run/mpi_bootstrap/hosts.txt | xargs /scripts/gen_hostfiles.sh + popd + + # Run workload and process results. + for i in "${!BENCHMARKS[@]}"; do + BENCHMARK=${BENCHMARKS[i]} + MASK=${MASKS[i]} + echo "Running benchmark ${BENCHMARK} with mask ${MASK}..." + + BM_LOG_DIR="${JOB_LOG_DIR}/bm_${BENCHMARK}_mask_${MASK}" + mkdir -p "$BM_LOG_DIR" + + NNODES="$NNODES" \ + BM_LOG_DIR="$BM_LOG_DIR" \ + BENCHMARK="$BENCHMARK" \ + MASK="$MASK" \ + MSG_SIZE_BEGIN="$MSG_SIZE_BEGIN" \ + MSG_SIZE_END="$MSG_SIZE_END" \ + GPUS_PER_NODE="$GPUS_PER_NODE" \ + WARMUP_ITERS="$WARMUP_ITERS" \ + RUN_ITERS="$RUN_ITERS" \ + N_RUNS="$N_RUNS" \ + UNRESERVED_CORES="$UNRESERVED_CORES" \ + GPU_TELEMETRY="$GPU_TELEMETRY" \ + "${SCRIPT_DIR}/run_nccl_benchmark.sh" + done + + # Tell each node the MPI workload has terminated. + mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 \ + --mca routed direct --allow-run-as-root \ + -np "$NNODES" --hostfile /scripts/hostfiles${NNODES}/hostfile1 \ + touch /usr/share/nccl_benchmarks/workload_terminated +else + while [[ ! -e /usr/share/nccl_benchmarks/workload_terminated ]]; do + sleep 10 + done +fi diff --git a/sample_workloads/nccltest/docker/scripts/mpi_entry.sh b/sample_workloads/nccltest/docker/scripts/mpi_entry.sh new file mode 100644 index 00000000..00bb5d22 --- /dev/null +++ b/sample_workloads/nccltest/docker/scripts/mpi_entry.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Job parameters. +: "${RUN_LOG_DIR:?Must set RUN_LOG_DIR}" + +# Benchmark parameters. +: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}" +: "${MSG_SIZE_END:?Must set MSG_SIZE_END}" +: "${WARMUP_ITERS:?Must set WARMUP_ITERS}" +: "${RUN_ITERS:?Must set RUN_ITERS}" + +# Unreserved cores for taskset call. This is a CSV of ranges for cores unused +# by TCPX. +: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}" + +# Telemetry. +: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}" + +# OpenMPI parameters. 
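+# (The OMPI_COMM_WORLD_* variables below are exported automatically by Open
+# MPI's mpirun for every launched rank; no script in this image sets them.)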
+: "${OMPI_COMM_WORLD_RANK:?Must set OMPI_COMM_WORLD_RANK}" +: "${OMPI_COMM_WORLD_LOCAL_SIZE:?Must set OMPI_COMM_WORLD_LOCAL_SIZE}" +: "${OMPI_COMM_WORLD_LOCAL_RANK:?Must set OMPI_COMM_WORLD_LOCAL_RANK}" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Compute node rank based on global rank and local world size. +# This is a workaround for us not being able to send this info directly in MPI. +NODE_RANK=$(( OMPI_COMM_WORLD_RANK / OMPI_COMM_WORLD_LOCAL_SIZE )) + +TELEMETRY_DIR="${RUN_LOG_DIR}/telemetry/node${NODE_RANK}" +# Add GPU profiling. +NSYS_PREFIX="" +if [[ "$GPU_TELEMETRY" == "true" ]]; then + GPU_TELEMETRY_DIR="${TELEMETRY_DIR}/gpu" + mkdir -p "$GPU_TELEMETRY_DIR" + GPU_TELEMETRY_OUTPUT="${GPU_TELEMETRY_DIR}/rank${OMPI_COMM_WORLD_LOCAL_RANK}" + NSYS_PREFIX="nsys profile \ + --wait primary -o ${GPU_TELEMETRY_OUTPUT} \ + --force-overwrite true -t cuda,nvtx -s none --export sqlite" +fi + +$NSYS_PREFIX \ +taskset -c "$UNRESERVED_CORES" \ + /third_party/nccl-tests-mpi/build/${BENCHMARK} \ + -b "$MSG_SIZE_BEGIN" -e "$MSG_SIZE_END" -f 2 -g 1 -w "$WARMUP_ITERS" --iters "$RUN_ITERS" -c 0 diff --git a/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh b/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh new file mode 100644 index 00000000..d21d1663 --- /dev/null +++ b/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +set -u +set -o pipefail + +# Job parameters. +: "${NNODES:?Must set NNODES}" +: "${BM_LOG_DIR:?Must set BM_LOG_DIR}" + +# Benchmark parameters. +: "${BENCHMARK:?Must set BENCHMARK}" +: "${MASK:?Must set MASK}" +: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}" +: "${MSG_SIZE_END:?Must set MSG_SIZE_END}" +: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}" +: "${WARMUP_ITERS:?Must set WARMUP_ITERS}" +: "${RUN_ITERS:?Must set RUN_ITERS}" +: "${N_RUNS:?Must set N_RUNS}" + +# Unreserved cores for taskset call. This is a CSV of ranges for cores unused +# by TCPX. +: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}" + +# Telemetry. +: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +NRANKS=$(( NNODES * GPUS_PER_NODE )) + +# Generate CUDA_VISIBLE_DEVICES. +CUDA_VISIBLE_DEVICES=$( seq -s, 0 1 $(( GPUS_PER_NODE - 1 )) ) + +# Generate NCCL flags for application. +NCCL_FLAGS=$( env | egrep ^NCCL | awk '{ printf "-x %s ", $0; }' ) + +# Run actual NCCL benchmarks. +for (( i = 1; i <= $N_RUNS; ++i )); do + RUN_LOG_DIR="${BM_LOG_DIR}/per_run_results/run_${i}" + mkdir -p "$RUN_LOG_DIR" + LOGFILE_PATH="${RUN_LOG_DIR}/logs.txt" + + echo "benchmark: ${BENCHMARK}, mask: ${MASK}, run ${i}/${N_RUNS}" + + # Run benchmark, with a 3-hour timeout. + mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 \ + --mca routed direct --allow-run-as-root -np "$NRANKS" \ + --hostfile "/scripts/hostfiles${NNODES}/hostfile${GPUS_PER_NODE}" \ + --timeout $(( 3 * 60 * 60 )) \ + -x LD_LIBRARY_PATH -x PATH \ + -x "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" \ + $NCCL_FLAGS \ + -x "RUN_LOG_DIR=${RUN_LOG_DIR}" \ + -x "UNRESERVED_CORES=${UNRESERVED_CORES}" \ + -x "GPU_TELEMETRY=${GPU_TELEMETRY}" \ + -x "BENCHMARK=${BENCHMARK}" \ + -x "MSG_SIZE_BEGIN=${MSG_SIZE_BEGIN}" \ + -x "MSG_SIZE_END=${MSG_SIZE_END}" \ + -x "WARMUP_ITERS=${WARMUP_ITERS}" \ + -x "RUN_ITERS=${RUN_ITERS}" \ + -x "NCCL_TESTS_SPLIT_MASK=${MASK}" \ + "${SCRIPT_DIR}/mpi_entry.sh" 2>&1 | \ + tee "$LOGFILE_PATH" + + EXIT_STATUS=$? 
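+  # With `set -o pipefail` above, this captures mpirun's exit status even
+  # though its output is piped through tee.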
+ if [[ "$EXIT_STATUS" -ne 0 ]]; then + echo "WARNING: got non-zero exit status ${EXIT_STATUS}" + fi + + # Generate the CSVs for the run. + CSV_PATH="${RUN_LOG_DIR}/results.csv" + grep "float" "$LOGFILE_PATH" | \ + awk '{ printf "%s,%s,%s\n", $4, $9, $11 }' > "$CSV_PATH" +done diff --git a/sample_workloads/nccltest/gke/Chart.yaml b/sample_workloads/nccltest/gke/Chart.yaml new file mode 100644 index 00000000..997419b6 --- /dev/null +++ b/sample_workloads/nccltest/gke/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: nccl-benchmarks +description: nccl-benchmarks +type: application +version: 0.1.0 +appVersion: 0.1.0 diff --git a/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml b/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml new file mode 100644 index 00000000..2b0fbd8c --- /dev/null +++ b/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml @@ -0,0 +1,176 @@ +# yamllint disable +{{- $requiredVar := .Values.cluster.nNodes | required ".Values.cluster.nNodes is required, add --set cluster.nNodes=2" -}} + +{{ $timestamp := now | date "2006-01-02-150405" }} + +apiVersion: v1 +kind: Service +metadata: + name: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" +spec: + selector: + name: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" + clusterIP: None +--- +{{ $node_count := .Values.cluster.nNodes | int }} +{{ $superblock_count := .Values.cluster.nSuperblocks | int }} +{{ $nodesPerSuperblock := divf $node_count $superblock_count | ceil }} + +{{- range $node_index, $element := until $node_count }} +apiVersion: v1 +kind: Pod +metadata: + name: nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}-pod{{ $node_index }} + {{- if eq $node_index 0 }} + labels: + name: nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }} + {{- end }} +spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: nccl-benchmarks-pod{{ $node_index }} + subdomain: nccl-benchmarks-{{ $timestamp }} + serviceAccountName: "default" + restartPolicy: Never + {{- if $.Values.cluster.sbPlacement }} + {{ $superblockChunk := div $node_index $nodesPerSuperblock | int }} + {{ $superblockIndex := add $.Values.cluster.startSuperblock $superblockChunk | int }} + nodeSelector: + superblock: "{{ $superblockIndex }}" + {{- end }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + volumes: + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + - name: tcpd-socket + hostPath: + path: /run/tcpx + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 200Gi + - name: tcpx-nccl-plugin-volume + emptyDir: {} + - name: workload-terminated-volume + emptyDir: {} + initContainers: + - name: tcpx-nccl-plugin-installer + image: {{ $.Values.ncclPlugin.image }}:{{ $.Values.ncclPlugin.tag }} + imagePullPolicy: Always + volumeMounts: + - name: tcpx-nccl-plugin-volume + mountPath: /var/lib/tcpx + resources: + requests: + cpu: 150m + command: + - /bin/bash + - -c + - | + /scripts/container_entry.sh install --install-nccl + {{- if $.Values.telemetry.gpu }} \ + --nccl-buildtype=nvtx + {{- end }} + containers: + - name: tcpd-daemon + image: {{ $.Values.rxdm.image }}:{{ $.Values.rxdm.tag }} + imagePullPolicy: Always + command: + - "bash" + - "-c" + - | + /tcpgpudmarxd/build/app/tcpgpudmarxd {{- range $.Values.rxdm.flags }} {{.}} {{- end }} & + while [ ! 
-e "/usr/share/nccl_benchmarks/workload_terminated" ]; do sleep 10; done + pkill -e "^"tcpgpudmarxd || true + sleep 30 + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia/lib64 + - name: tcpd-socket + mountPath: /tmp + - name: workload-terminated-volume + mountPath: /usr/share/nccl_benchmarks + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + - name: nccl-benchmarks + image: {{ $.Values.ncclBenchmarks.image }}:{{ $.Values.ncclBenchmarks.tag }} + imagePullPolicy: Always + securityContext: + privileged: true + capabilities: + add: + - SYS_ADMIN + - SYS_PTRACE + - IPC_LOCK + env: + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_NAME + value: "{{ $.Release.Name }}" + - name: MASTER_ADDR + value: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" + - name: NNODES + value: "{{ $node_count }}" + - name: NODE_RANK + value: "{{ $node_index }}" + - name: GCS_BUCKET + value: "{{ $.Values.cluster.gcsBucket }}" + - name: LD_LIBRARY_PATH + value: "/usr/local/tcpx/lib64:/usr/local/nvidia/lib64" + - name: BENCHMARKS_CSV + value: "{{ $.Values.ncclBenchmarks.benchmarks }}" + - name: MASKS_CSV + value: "{{ $.Values.ncclBenchmarks.masks }}" + - name: MSG_SIZE_BEGIN + value: "{{ $.Values.ncclBenchmarks.msgSizeBegin }}" + - name: MSG_SIZE_END + value: "{{ $.Values.ncclBenchmarks.msgSizeEnd }}" + - name: GPUS_PER_NODE + value: "{{ $.Values.ncclBenchmarks.gpusPerNode }}" + - name: WARMUP_ITERS + value: "{{ $.Values.ncclBenchmarks.warmupIters }}" + - name: RUN_ITERS + value: "{{ $.Values.ncclBenchmarks.runIters }}" + - name: N_RUNS + value: "{{ $.Values.ncclBenchmarks.nRuns }}" + - name: UNRESERVED_CORES + value: "{{ $.Values.ncclPlugin.unreservedCores }}" + - name: GPU_TELEMETRY + value: "{{ $.Values.telemetry.gpu }}" + {{- range $key, $value := $.Values.ncclPlugin.envs }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- if $.Values.telemetry.gpu }} + - name: NCCL_PROXY_NVTX_ENABLE + value: "1" + {{- end }} + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia/lib64 + - name: tcpd-socket + mountPath: /tmp + - name: shared-memory + mountPath: /dev/shm + - name: tcpx-nccl-plugin-volume + mountPath: /usr/local/tcpx + - name: workload-terminated-volume + mountPath: /usr/share/nccl_benchmarks + resources: + limits: + nvidia.com/gpu: !!int 8 +--- +{{- end }} diff --git a/sample_workloads/nccltest/gke/values.yaml b/sample_workloads/nccltest/gke/values.yaml new file mode 100644 index 00000000..bbb66df3 --- /dev/null +++ b/sample_workloads/nccltest/gke/values.yaml @@ -0,0 +1,59 @@ +cluster: + nNodes: null # Must specify on commandline (--set cluster.nNodes=2) + sbPlacement: false # Set `true` if running across multiple superblocks + nSuperblocks: 1 + startSuperblock: 1 + gcsBucket: null + +ncclBenchmarks: + image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-benchmark" + tag: "latest" + # A comma-separated list of benchmarks to run. + benchmarks: "all_gather_perf,all_reduce_perf,sendrecv_perf" + # A comma-separated list of hex masks for the benchmarks. + # Must be of the same length as ncclBenchmarks.benchmarks. + # Each mask is recommended to be less than ncclBenchmarks.gpusPerNode. + masks: "0x0,0x0,0x7" + msgSizeBegin: "1M" + msgSizeEnd: "8G" + # Number of GPUs per node. Must be one of 1, 2, 4, 8. 
+  gpusPerNode: 8
+  warmupIters: 5
+  runIters: 100
+  nRuns: 5
+
+telemetry:
+  gpu: false
+
+rxdm:
+  image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev"
+  tag: "v2.0.9"
+  flags: ["--setup_param \"--verbose 128 2 0\"", "--gpu_nic_preset a3vm", "--gpu_shmem_type fd"]
+
+ncclPlugin:
+  image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev"
+  tag: "v3.1.6_2023_10_23"
+  unreservedCores: "0-7,104-111,52-59,156-163"
+  envs:
+    NCCL_GPUDIRECTTCPX_FORCE_ACK: "0"
+    NCCL_SOCKET_IFNAME: "eth0"
+    NCCL_DYNAMIC_CHUNK_SIZE: 524288
+    NCCL_P2P_NET_CHUNKSIZE: 524288
+    NCCL_P2P_PCI_CHUNKSIZE: 524288
+    NCCL_P2P_NVL_CHUNKSIZE: 1048576
+    NCCL_GPUDIRECTTCPX_TX_BINDINGS:
+      "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+    NCCL_GPUDIRECTTCPX_RX_BINDINGS:
+      "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+    NCCL_NSOCKS_PERTHREAD: 4
+    NCCL_SOCKET_NTHREADS: 1
+    NCCL_MAX_NCHANNELS: 12
+    NCCL_MIN_NCHANNELS: 12
+    NCCL_GPUDIRECTTCPX_SOCKET_IFNAME: "eth1,eth2,eth3,eth4"
+    NCCL_GPUDIRECTTCPX_CTRL_DEV: "eth0"
+    NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS: "1000000"
+    NCCL_CROSS_NIC: 0
+    NCCL_ALGO: "Ring"
+    NCCL_PROTO: "Simple"
+    NCCL_NET_GDR_LEVEL: "PIX"
+    NCCL_P2P_PXN_LEVEL: 0
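+    # Note: any variable here whose name starts with "NCCL" is forwarded to
+    # the benchmark processes (run_nccl_benchmark.sh collects NCCL* variables
+    # from the container environment and passes them to mpirun via -x).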