Add a nccl-test sample workload (#345)
Chris113113 authored Dec 15, 2023
1 parent 12c7ce5 commit 9696b8f
Showing 11 changed files with 837 additions and 0 deletions.
150 changes: 150 additions & 0 deletions sample_workloads/nccltest/README.md
# TCPX NCCL Level Benchmarks

This document walks you through building, pushing, and deploying the nccl-benchmark image for use on a GKE cluster configured with an A3 node pool.

## Building the Benchmark Docker Image

To build the latest benchmark docker image, run:

```shell
cd docker && docker build . -t nccl-benchmarks
```

**Note:** A pre-built image is referenced in the `values.yaml` file.

## Running the TCPX NCCL Benchmarks

This section describes how you can run a 2-node world-level all-reduce
benchmark at message sizes 1G and 8G.


If you intend to run with GKE, run:

```shell
cd gke
PARAMS="cluster.nNodes=2,"
PARAMS+="ncclBenchmarks.benchmarks=all_reduce_perf,"
PARAMS+="ncclBenchmarks.masks=0x0,"
PARAMS+="ncclBenchmarks.msgSizes=1G\,8G"
helm install "${USER}-nccl-bm" . --set "$PARAMS"
```

Once the job is scheduled, find your master pod by running:

```shell
kubectl get pods | grep "${USER}-nccl-bm.*pod0"
```

You can then follow the logs with:

```shell
kubectl logs --follow <master-pod-name> -c nccl-benchmarks
```

### Finding Results

The container will log the output of NVIDIA's nccl-tests binaries for each of the tests requested in the parameters.

Each test run is logged separately. If you specify a release that uses multiple tests and multiple runs, the logs will be ordered by Test0Run0...Test0RunN...TestNRunN.

An example test output would look like:
```
benchmark: all_reduce_perf, mask: 0x0, run 1/1
# nThread 1 nGpus 1 minBytes 1048576 maxBytes 8589934592 step: 2(factor) warmup iters: 2 iters: 10 agg iters: 1 validation: 0 graph: 0
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1048576 262144 float sum -1 1788.7 0.59 1.10 N/A 1725.9 0.61 1.14 N/A
2097152 524288 float sum -1 1645.1 1.27 2.39 N/A 1649.8 1.27 2.38 N/A
4194304 1048576 float sum -1 1701.3 2.47 4.62 N/A 1691.6 2.48 4.65 N/A
8388608 2097152 float sum -1 1891.3 4.44 8.32 N/A 1911.9 4.39 8.23 N/A
16777216 4194304 float sum -1 1932.8 8.68 16.28 N/A 2014.4 8.33 15.62 N/A
33554432 8388608 float sum -1 2190.8 15.32 28.72 N/A 2395.8 14.01 26.26 N/A
67108864 16777216 float sum -1 2367.6 28.34 53.15 N/A 2389.8 28.08 52.65 N/A
134217728 33554432 float sum -1 3539.5 37.92 71.10 N/A 3266.7 41.09 77.04 N/A
268435456 67108864 float sum -1 5969.3 44.97 84.32 N/A 5850.8 45.88 86.03 N/A
536870912 134217728 float sum -1 11625 46.18 86.59 N/A 11737 45.74 85.77 N/A
1073741824 268435456 float sum -1 23144 46.39 86.99 N/A 38777 27.69 51.92 N/A
2147483648 536870912 float sum -1 45662 47.03 88.18 N/A 45522 47.17 88.45 N/A
4294967296 1073741824 float sum -1 90227 47.60 89.25 N/A 90354 47.53 89.13 N/A
8589934592 2147483648 float sum -1 179880 47.75 89.54 N/A 178867 48.02 90.05 N/A
# Out of bounds values : 0 OK
# Avg bus bandwidth : 49.6371
```

If `gcsBucket` is specified in the values.yaml file, then the logs will also be uploaded to the specified bucket.
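As a sketch of how to enable this from the command line instead of editing `values.yaml` (the `ncclBenchmarks.gcsBucket` key path and the bucket name are assumptions; check `values.yaml` for the real key), the `PARAMS` string from the install example above could be extended like so:

```shell
# Extend the PARAMS string from the GKE example to also upload logs
# to a GCS bucket. Key path and bucket name are placeholders.
PARAMS="cluster.nNodes=2,"
PARAMS+="ncclBenchmarks.benchmarks=all_reduce_perf,"
PARAMS+="ncclBenchmarks.masks=0x0,"
PARAMS+="ncclBenchmarks.msgSizes=1G\,8G,"
PARAMS+="ncclBenchmarks.gcsBucket=my-results-bucket"
echo "$PARAMS"
# then: helm install "${USER}-nccl-bm" . --set "$PARAMS"
```

Note the `\,` escape: `--set` treats a bare comma as a key separator, so commas inside a single value must be escaped.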

### Breaking Down the Parameters

*Disclaimer: This is not a comprehensive list. Refer to `values.yaml` for a full list of tunable parameters.*

#### Benchmark Parameters

|yaml path|Explanation|
|---|---|
|`ncclBenchmarks.benchmarks`|A CSV of benchmarks to run.|
|`ncclBenchmarks.masks`|A CSV of hexadecimal masks to use.|
|`ncclBenchmarks.msgSizeBegin`|The minimum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/master/src/common.cu#L86). |
|`ncclBenchmarks.msgSizeEnd`|The maximum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/master/src/common.cu#L86). |
|`ncclBenchmarks.nGpusPerNode`|Number of GPUs per node to use.|
|`ncclBenchmarks.warmupIters`|Number of warmup iterations.|
|`ncclBenchmarks.runIters`|Number of iterations per run.|
|`ncclBenchmarks.nRuns`|Number of runs to aggregate over.|

##### Benchmarks, masks, and message sizes

You can specify multiple benchmarks, each with its own hexadecimal mask. **The
message sizes to sweep over are shared across all benchmarks.** Supported
benchmarks are `all_gather_perf`, `all_reduce_perf`, `reduce_scatter_perf`, `broadcast_perf`,
`reduce_perf`, `sendrecv_perf`, `scatter_perf`, `gather_perf`, `alltoall_perf`, and `hypercube_perf`.

For each benchmark, you must supply a mask. The benchmark does a bitwise AND
between the rank and the mask to get a color, and ranks with the same color
go in the same NCCL communicator. Examples:

- For a world-level NCCL operation, `MASK=0x0`.
- For a rail-aligned NCCL operation using all 8 GPUs on a VM, `MASK=0x7`.
- For a rail-aligned NCCL operation using only 4 GPUs on a VM, `MASK=0x3`.

*Note: Providing a mask larger than the number of GPUs on a VM will result in asymmetric network traffic between VMs.*
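The coloring rule can be checked with plain shell arithmetic. A minimal sketch, with 16 ranks standing in for a 2-node, 8-GPU-per-node job and the rail-aligned mask `0x7`:

```shell
# Each rank's communicator color is (rank & mask). With mask 0x7,
# ranks 0 and 8 (GPU 0 on each node) share color 0 -- one rail.
MASK=0x7
for rank in $(seq 0 15); do
  echo "rank=${rank} color=$(( rank & MASK ))"
done
```

With `MASK=0x0`, every rank gets color 0, i.e. a single world-level communicator.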

Message sizes should be specified using 'G', 'M', 'K', or no suffix for bytes. For example, 1G == 1024M == (1024 * 1024)K == (1024 * 1024 * 1024) bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/1292b25553bd0384f2faa2965f9d82b99797a348/src/common.cu#L86C1-L120C2).
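As a sanity check of the suffix arithmetic, here is a hypothetical helper (not part of the chart) mirroring that convention:

```shell
# Convert a message size with an optional G/M/K suffix to bytes,
# following the power-of-two convention described above.
to_bytes() {
  local v="$1"
  case "$v" in
    *G) echo $(( ${v%G} * 1024 * 1024 * 1024 )) ;;
    *M) echo $(( ${v%M} * 1024 * 1024 )) ;;
    *K) echo $(( ${v%K} * 1024 )) ;;
    *)  echo "$v" ;;
  esac
}
to_bytes 1G   # prints 1073741824, the same value as to_bytes 1024M
```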

##### `warmupIters` and `runIters`

For each message size, the benchmark will measure the average latency and bus
bandwidth used. Each run consists of a few
warmup iterations, followed by the actual measurements used to derive
performance.

#### Switching Out Software Components

|GKE|Explanation|
|---|---|
|`rxdm.image`|Image for the TCPX RxDM.|
|`rxdm.tag`|Tag for the TCPX RxDM.|
|`rxdm.flags`|Runtime flags for the TCPX RxDM.|
|`ncclPlugin.image`|Image for the TCPX NCCL plugin.|
|`ncclPlugin.tag`|Tag for the TCPX NCCL plugin.|
|`ncclPlugin.unreservedCores`|Cores left unreserved for the application.|
|`ncclPlugin.envs`|Environment variables for the TCPX NCCL plugin.|

**For TCPX NCCL, any environment variables starting with `NCCL` will be picked
up by the benchmarking container.**

#### More Fine-Grained Node Placement Control

|yaml path|Explanation|
|---|---|
|`cluster.sbPlacement`|If deliberate superblock placement should be enabled.|
|`cluster.nSuperblocks`|Number of superblocks for job to span over.|
|`cluster.startSuperblock`|Which superblock to start job on.|

In GKE, we have a flag to toggle deliberate superblock placement. If enabled,
we will try to split the job among `cluster.nSuperblocks` superblocks, starting
from superblock `cluster.startSuperblock`. **This guarantees closer affinity
between the job nodes and should be enabled for performance benchmarking.**

*Note that this feature relies on a `superblock` label in the Kubernetes
cluster and will not work if that label is missing. For example, superblock 1 should be labeled with `superblock: 1`.*
73 changes: 73 additions & 0 deletions sample_workloads/nccltest/docker/Dockerfile
FROM nvidia/cuda:12.0.0-devel-ubuntu20.04

ENV DEBIAN_FRONTEND='noninteractive'

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
git openssh-server wget iproute2 vim libopenmpi-dev build-essential \
cmake gdb python3 \
protobuf-compiler libprotobuf-dev rsync libssl-dev \
&& rm -rf /var/lib/apt/lists/*

ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'

WORKDIR /third_party
# Install patched NCCL
RUN git clone https://github.com/NVIDIA/nccl.git nccl-netsupport && \
cd nccl-netsupport && \
git fetch --all --tags && \
git checkout -b github_nccl_2_18_5 05121c8191984aada7ed57dd8081bd987f73288f
WORKDIR nccl-netsupport
RUN make NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16

WORKDIR /third_party
RUN git clone https://github.com/NVIDIA/nccl-tests.git
WORKDIR nccl-tests
RUN git fetch --all --tags
RUN make CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16

WORKDIR /third_party
RUN git clone https://github.com/NVIDIA/nccl-tests.git nccl-tests-mpi
WORKDIR nccl-tests-mpi
RUN git fetch --all --tags
RUN make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16

# copy all license files
WORKDIR /third_party/licenses
RUN cp ../nccl-netsupport/LICENSE.txt license_nccl.txt
RUN cp ../nccl-tests/LICENSE.txt license_nccl_tests.txt

# Setup SSH to use port 222
RUN cd /etc/ssh/ && sed --in-place='.bak' 's/#Port 22/Port 222/' sshd_config && \
sed --in-place='.bak' 's/#PermitRootLogin prohibit-password/PermitRootLogin prohibit-password/' sshd_config
RUN ssh-keygen -t rsa -b 4096 -q -f /root/.ssh/id_rsa -N ""
RUN touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys

# remove NCCL from container
WORKDIR /third_party/nccl-netsupport
RUN rm -f build/lib/libnccl*
WORKDIR /usr/lib/x86_64-linux-gnu/
RUN rm -f libnccl*

# Install gcsfuse and python3.
RUN apt-get update \
&& apt-get install --yes \
curl lsb-release cuda-nsight-systems-12-0 \
&& echo "deb https://packages.cloud.google.com/apt gcsfuse-$(lsb_release -c -s) main" \
| tee /etc/apt/sources.list.d/gcsfuse.list \
&& curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
&& apt-get update \
&& apt-get install -y gcsfuse \
&& apt-get clean && rm -rf /var/lib/apt/lists/* \
&& mkdir /gcs

ADD root_scripts /scripts
RUN chmod +rx /scripts/gen_hostfiles.sh /scripts/init_ssh.sh /scripts/tune_net.sh

ADD scripts /workspace
RUN chmod +rx /workspace/container_entry.sh /workspace/mpi_entry.sh /workspace/run_nccl_benchmark.sh

WORKDIR /workspace
ENTRYPOINT ["/bin/bash", "/workspace/container_entry.sh"]
24 changes: 24 additions & 0 deletions sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
#!/bin/bash

len() {
local -r arr=($@)
echo "${#arr[@]}"
}

NRANKS_FACTORS=(1 2 4 8)

NHOSTS=$(len "$@")
echo "generating hostfiles for ${NHOSTS} hosts: "
for h in "$@"; do echo "$h"; done

mkdir -p "hostfiles${NHOSTS}"

for nr in "${NRANKS_FACTORS[@]}";
do
rm -f "hostfiles${NHOSTS}/hostfile${nr}"
touch "hostfiles${NHOSTS}/hostfile${nr}"
for h in "$@";
do
echo "$h port=222 slots=${nr}" >> "hostfiles${NHOSTS}/hostfile${nr}"
done
done
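For reference, a self-contained sketch of what this script produces for two hypothetical hosts (`node-a` and `node-b` are placeholders):

```shell
# Reproduces gen_hostfiles.sh's output layout for a 2-host job:
# one hostfile per slots count, each listing every host on port 222.
hosts=(node-a node-b)
n=${#hosts[@]}
mkdir -p "hostfiles${n}"
for nr in 1 2 4 8; do
  : > "hostfiles${n}/hostfile${nr}"
  for h in "${hosts[@]}"; do
    echo "$h port=222 slots=${nr}" >> "hostfiles${n}/hostfile${nr}"
  done
done
cat "hostfiles${n}/hostfile8"
# node-a port=222 slots=8
# node-b port=222 slots=8
```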
13 changes: 13 additions & 0 deletions sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
#!/bin/bash

PORT=${PORT:-222}

while true; do
host=$1
if [[ -z $host ]]; then
break
fi
ssh -o StrictHostKeyChecking=no -p "${PORT}" "$host" \
echo "Hello from ${host}"
shift
done
66 changes: 66 additions & 0 deletions sample_workloads/nccltest/docker/root_scripts/tune_net.sh
#!/bin/bash

set -e

# Dumps the irq binding of $ifname for sanity checking.
dump_irq_binding() {
local -r ifname="$1"
echo -e "\n\ndump_irq_binding: ifname=${ifname}\n"
for irq in $(ls "/sys/class/net/${ifname}/device/msi_irqs/"); do
smp_affinity_list=$(cat "/proc/irq/${irq}/smp_affinity_list")
echo irq="$irq" smp_affinity_list="$smp_affinity_list"
done
}

set_irq_range() {
local -r nic="$1"
local core_start="$2"
local num_cores="$3"

# The user may not have this $nic configured on their VM, if not, just skip
# it, no need to error out.
if [[ ! -d "/sys/class/net/${nic}/device" ]]; then
return;
fi

echo "Setting irq binding for ${nic}..."

# We count the number of rx queues and assume number of rx queues == tx
# queues. Currently the GVE configuration at boot is 16 rx + 16 tx.
num_q=$(ls -1 "/sys/class/net/${nic}/queues/" | grep rx | wc -l)

irq_start=$(ls -1 "/sys/class/net/${nic}/device/msi_irqs" | sort -n | head -n 1)
idx=0
for ((queue = 0; queue < "$num_q"; queue++)); do
irq=$((irq_start + "$queue"))

core=$(( core_start + idx ))

# this is GVE's TX irq. See gve_tx_idx_to_ntfy().
echo "$core" > /proc/irq/"$irq"/smp_affinity_list

# this is GVE's RX irq. See gve_rx_idx_to_ntfy().
echo "$core" > /proc/irq/$(("$irq" + "$num_q"))/smp_affinity_list

idx=$(( (idx + 1) % num_cores ))
done
}

# The eth0-eth4 core ranges below are based on expectations for COS on GKE.
# If this is run on a system with different interface names, change these to the correct names.

a3_bind_irqs() {
set_irq_range eth0 32 4
set_irq_range eth1 36 8
set_irq_range eth2 44 8
set_irq_range eth3 88 8
set_irq_range eth4 96 8
}

a3_bind_irqs

dump_irq_binding eth0
dump_irq_binding eth1
dump_irq_binding eth2
dump_irq_binding eth3
dump_irq_binding eth4