From 9696b8fa30bdd59348c97bfd739116d5c3099a9f Mon Sep 17 00:00:00 2001
From: Christopher Pirillo
Date: Fri, 15 Dec 2023 10:30:19 -0800
Subject: [PATCH] Add a nccl-test sample workload (#345)

---
 sample_workloads/nccltest/README.md           | 150 +++++++++++++++
 sample_workloads/nccltest/docker/Dockerfile   |  73 ++++++++
 .../docker/root_scripts/gen_hostfiles.sh      |  24 +++
 .../nccltest/docker/root_scripts/init_ssh.sh  |  13 ++
 .../nccltest/docker/root_scripts/tune_net.sh  |  66 +++++++
 .../docker/scripts/container_entry.sh         | 151 +++++++++++++++
 .../nccltest/docker/scripts/mpi_entry.sh      |  45 +++++
 .../docker/scripts/run_nccl_benchmark.sh      |  74 ++++++++
 sample_workloads/nccltest/gke/Chart.yaml      |   6 +
 .../gke/templates/nccl_benchmarks.yaml        | 176 ++++++++++++++++++
 sample_workloads/nccltest/gke/values.yaml     |  59 ++++++
 11 files changed, 837 insertions(+)
 create mode 100644 sample_workloads/nccltest/README.md
 create mode 100644 sample_workloads/nccltest/docker/Dockerfile
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
 create mode 100644 sample_workloads/nccltest/docker/root_scripts/tune_net.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/container_entry.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/mpi_entry.sh
 create mode 100644 sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh
 create mode 100644 sample_workloads/nccltest/gke/Chart.yaml
 create mode 100644 sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml
 create mode 100644 sample_workloads/nccltest/gke/values.yaml

diff --git a/sample_workloads/nccltest/README.md b/sample_workloads/nccltest/README.md
new file mode 100644
index 00000000..1096affe
--- /dev/null
+++ b/sample_workloads/nccltest/README.md
@@ -0,0 +1,150 @@
+# TCPX NCCL Level Benchmarks
+
+This document walks you through building, pushing, and deploying the nccl-benchmark image for use on a GKE cluster configured with an A3 node pool.
+
+## Building the Benchmark Docker Image
+
+To build the latest benchmark Docker image, run:
+
+```shell
+cd docker && docker build . -t nccl-benchmarks
+```
+
+**Note: a pre-built image is already referenced in `values.yaml`.**
+
+## Running the TCPX NCCL Benchmarks
+
+This section describes how you can run a 2-node, world-level all-reduce
+benchmark sweeping message sizes from 1G to 8G.
+
+If you intend to run on GKE, run:
+
+```shell
+cd gke
+PARAMS="cluster.nNodes=2,"
+PARAMS+="ncclBenchmarks.benchmarks=all_reduce_perf,"
+PARAMS+="ncclBenchmarks.masks=0x0,"
+PARAMS+="ncclBenchmarks.msgSizeBegin=1G,"
+PARAMS+="ncclBenchmarks.msgSizeEnd=8G"
+helm install "${USER}-nccl-bm" . --set "$PARAMS"
+```
+
+Once the job is scheduled, find your master pod by running:
+
+```shell
+kubectl get pods | grep "${USER}-nccl-bm.*pod0"
+```
+
+You can then follow the logs with (substituting the pod name found above):
+
+```shell
+kubectl logs --follow <master-pod-name> -c nccl-benchmarks
+```
+
+### Finding Results
+
+The container logs the output of NVIDIA's nccl-tests binaries for each of the tests requested in the parameters.
+
+Each test run is logged separately. If you specify a release that uses multiple tests and multiple runs, the logs will be ordered Test0Run0...Test0RunN...TestNRunN.
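+
+If `gcsBucket` is set, you can also pull a whole job's results down locally once it finishes. A minimal sketch using `gsutil`, assuming the job directory naming from `container_entry.sh` (`<timestamp>_<jobname>_nnodes_<N>_gpus_<G>`; the bucket and job names below are placeholders):
+
+```shell
+# List finished jobs in the results bucket.
+gsutil ls gs://my-results-bucket/
+
+# Download all per-run logs and CSVs for one job.
+gsutil -m cp -r gs://my-results-bucket/2023-12-15-103019_my-nccl-bm_nnodes_2_gpus_8 ./results/
+```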
+
+An example test output would look like:
+```
+benchmark: all_reduce_perf, mask: 0x0, run 1/1
+# nThread 1 nGpus 1 minBytes 1048576 maxBytes 8589934592 step: 2(factor) warmup iters: 2 iters: 10 agg iters: 1 validation: 0 graph: 0
+#
+#                                                              out-of-place                       in-place
+#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+     1048576        262144     float     sum      -1   1788.7    0.59    1.10    N/A   1725.9    0.61    1.14    N/A
+     2097152        524288     float     sum      -1   1645.1    1.27    2.39    N/A   1649.8    1.27    2.38    N/A
+     4194304       1048576     float     sum      -1   1701.3    2.47    4.62    N/A   1691.6    2.48    4.65    N/A
+     8388608       2097152     float     sum      -1   1891.3    4.44    8.32    N/A   1911.9    4.39    8.23    N/A
+    16777216       4194304     float     sum      -1   1932.8    8.68   16.28    N/A   2014.4    8.33   15.62    N/A
+    33554432       8388608     float     sum      -1   2190.8   15.32   28.72    N/A   2395.8   14.01   26.26    N/A
+    67108864      16777216     float     sum      -1   2367.6   28.34   53.15    N/A   2389.8   28.08   52.65    N/A
+   134217728      33554432     float     sum      -1   3539.5   37.92   71.10    N/A   3266.7   41.09   77.04    N/A
+   268435456      67108864     float     sum      -1   5969.3   44.97   84.32    N/A   5850.8   45.88   86.03    N/A
+   536870912     134217728     float     sum      -1    11625   46.18   86.59    N/A    11737   45.74   85.77    N/A
+  1073741824     268435456     float     sum      -1    23144   46.39   86.99    N/A    38777   27.69   51.92    N/A
+  2147483648     536870912     float     sum      -1    45662   47.03   88.18    N/A    45522   47.17   88.45    N/A
+  4294967296    1073741824     float     sum      -1    90227   47.60   89.25    N/A    90354   47.53   89.13    N/A
+  8589934592    2147483648     float     sum      -1   179880   47.75   89.54    N/A   178867   48.02   90.05    N/A
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 49.6371
+```
+
+If `gcsBucket` is specified in the values.yaml file, then the logs will also be uploaded to the specified bucket.
+
+### Breaking Down the Parameters
+
+*Disclaimer: This is not a comprehensive list. Refer to `values.yaml` for a full list of tunable parameters.*
+
+#### Benchmark Parameters
+
+|yaml path|Explanation|
+|---|---|
+|`ncclBenchmarks.benchmarks`|A CSV of benchmarks to run.|
+|`ncclBenchmarks.masks`|A CSV of hexadecimal masks to use.|
+|`ncclBenchmarks.msgSizeBegin`|The minimum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes.|
+|`ncclBenchmarks.msgSizeEnd`|The maximum message size to use, specified using 'G', 'M', 'K', or no suffix for bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/master/src/common.cu#L86).|
+|`ncclBenchmarks.nGpusPerNode`|Number of GPUs per node to use.|
+|`ncclBenchmarks.warmupIters`|Number of warmup iterations.|
+|`ncclBenchmarks.runIters`|Number of iterations per run.|
+|`ncclBenchmarks.nRuns`|Number of runs to aggregate over.|
+
+##### Benchmarks, masks, and message sizes
+
+You can specify multiple benchmarks, each with its own hexadecimal mask. **The
+message sizes to sweep over are shared across all benchmarks.** Supported
+benchmarks are `all_gather_perf`, `all_reduce_perf`, `reduce_scatter_perf`, `broadcast_perf`,
+`reduce_perf`, `sendrecv_perf`, `scatter_perf`, `gather_perf`, `alltoall_perf`, and `hypercube_perf`.
+
+For each benchmark, you must supply a mask. The benchmark does a bitwise AND
+between the rank and the mask to get a color, and ranks with the same color
+go in the same NCCL communicator (see the sketch after the note below). Examples:
+
+- For a world-level NCCL operation, `MASK=0x0`.
+- For a rail-aligned NCCL operation using all 8 GPUs on a VM, `MASK=0x7`.
+- For a rail-aligned NCCL operation using only 4 GPUs on a VM, `MASK=0x3`.
+
+*Note: Providing a mask larger than the number of GPUs on a VM will result in asymmetric network traffic between VMs.*
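+
+As a concrete illustration of the rank-to-color mapping (plain shell arithmetic; the two-node, 16-rank setup is just an example):
+
+```shell
+# With MASK=0x7 on 2 nodes x 8 GPUs, ranks that share (rank & mask) land in
+# the same communicator: rank 0 pairs with rank 8, rank 1 with rank 9, etc.
+for rank in $(seq 0 15); do
+  echo "rank ${rank} -> color $(( rank & 0x7 ))"
+done
+```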
+
+Message sizes should be specified using 'G', 'M', 'K', or no suffix for bytes.
+For example, 1G == 1024M == (1024 * 1024)K == (1024 * 1024 * 1024) bytes. [Source](https://github.com/NVIDIA/nccl-tests/blob/1292b25553bd0384f2faa2965f9d82b99797a348/src/common.cu#L86C1-L120C2).
+
+##### WARMUP_ITERS and RUN_ITERS
+
+For each message size, the benchmark measures the average latency and bus
+bandwidth used. Each run consists of `warmupIters` warmup iterations, followed
+by `runIters` measured iterations used to derive performance.
+
+#### Switching Out Software Components
+
+|yaml path|Explanation|
+|---|---|
+|`rxdm.image`|Image for the TCPX RxDM.|
+|`rxdm.tag`|Tag for the TCPX RxDM.|
+|`rxdm.flags`|Runtime flags for the TCPX RxDM.|
+|`ncclPlugin.image`|Image for the TCPX NCCL plugin.|
+|`ncclPlugin.tag`|Tag for the TCPX NCCL plugin.|
+|`ncclPlugin.unreservedCores`|CPU cores left unreserved by TCPX, used for the application's `taskset` call.|
+|`ncclPlugin.envs`|Environment variables for the TCPX NCCL plugin.|
+
+**For TCPX NCCL, any environment variables starting with `NCCL` will be picked
+up by the benchmarking container.**
+
+#### More Fine-Grained Node Placement Control
+
+|yaml path|Explanation|
+|---|---|
+|`cluster.sbPlacement`|If deliberate superblock placement should be enabled.|
+|`cluster.nSuperblocks`|Number of superblocks for the job to span.|
+|`cluster.startSuperblock`|Which superblock to start the job on.|
+
+In GKE, we have a flag to toggle deliberate superblock placement. If enabled,
+we will try to split the job among `cluster.nSuperblocks` superblocks, starting
+from superblock `cluster.startSuperblock`. **This guarantees closer affinity
+between the job nodes and should be enabled for performance benchmarking.**
+
+*Note that this feature relies on a `superblock` node label in the Kubernetes
+cluster and will not work if that label is missing. For example, nodes in
+superblock 1 should carry the label `superblock: 1`.*
\ No newline at end of file
diff --git a/sample_workloads/nccltest/docker/Dockerfile b/sample_workloads/nccltest/docker/Dockerfile
new file mode 100644
index 00000000..382d9cdb
--- /dev/null
+++ b/sample_workloads/nccltest/docker/Dockerfile
@@ -0,0 +1,73 @@
+FROM nvidia/cuda:12.0.0-devel-ubuntu20.04
+
+ENV DEBIAN_FRONTEND='noninteractive'
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git openssh-server wget iproute2 vim libopenmpi-dev build-essential \
+        cmake gdb python3 \
+        protobuf-compiler libprotobuf-dev rsync libssl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
+ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
+
+WORKDIR /third_party
+# Install patched NCCL
+RUN git clone https://github.com/NVIDIA/nccl.git nccl-netsupport && \
+    cd nccl-netsupport && \
+    git fetch --all --tags && \
+    git checkout -b github_nccl_2_18_5 05121c8191984aada7ed57dd8081bd987f73288f
+WORKDIR nccl-netsupport
+RUN make NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+WORKDIR /third_party
+RUN git clone https://github.com/NVIDIA/nccl-tests.git
+WORKDIR nccl-tests
+RUN git fetch --all --tags
+RUN make CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+WORKDIR /third_party
+RUN git clone https://github.com/NVIDIA/nccl-tests.git nccl-tests-mpi
+WORKDIR nccl-tests-mpi
+RUN git fetch --all --tags
+RUN make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/third_party/nccl-netsupport/build NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" -j 16
+
+# copy all license files
+WORKDIR /third_party/licenses
+RUN cp ../nccl-netsupport/LICENSE.txt license_nccl.txt
+RUN cp ../nccl-tests/LICENSE.txt license_nccl_tests.txt
+
+# Setup SSH to use port 222
+RUN cd /etc/ssh/ && sed --in-place='.bak' 's/#Port 22/Port 222/' sshd_config && \
+    sed --in-place='.bak' 's/#PermitRootLogin prohibit-password/PermitRootLogin prohibit-password/' sshd_config
+RUN ssh-keygen -t rsa -b 4096 -q -f /root/.ssh/id_rsa -N ""
+RUN touch /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys
+RUN cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
+
+# remove NCCL from container
+WORKDIR /third_party/nccl-netsupport
+RUN rm -f build/lib/libnccl*
+WORKDIR /usr/lib/x86_64-linux-gnu/
+RUN rm -f libnccl*
+
+# Install gcsfuse and Nsight Systems.
+RUN apt-get update \
+    && apt-get install --yes \
+        curl lsb-release cuda-nsight-systems-12-0 \
+    && echo "deb https://packages.cloud.google.com/apt gcsfuse-$(lsb_release -c -s) main" \
+        | tee /etc/apt/sources.list.d/gcsfuse.list \
+    && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - \
+    && apt-get update \
+    && apt-get install -y gcsfuse \
+    && apt-get clean && rm -rf /var/lib/apt/lists/* \
+    && mkdir /gcs
+
+ADD root_scripts /scripts
+RUN chmod +rx /scripts/gen_hostfiles.sh /scripts/init_ssh.sh /scripts/tune_net.sh
+
+ADD scripts /workspace
+RUN chmod +rx /workspace/container_entry.sh /workspace/mpi_entry.sh /workspace/run_nccl_benchmark.sh
+
+WORKDIR /workspace
+ENTRYPOINT ["/bin/bash", "/workspace/container_entry.sh"]
diff --git a/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh b/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
new file mode 100644
index 00000000..3746c4fa
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/gen_hostfiles.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+len() {
+  local -r arr=($@)
+  echo "${#arr[@]}"
+}
+
+NRANKS_FACTORS=(1 2 4 8)
+
+NHOSTS=$(len "$@")
+echo "generating hostfiles for ${NHOSTS} hosts: "
+for h in "$@"; do echo "$h"; done
+
+mkdir -p "hostfiles${NHOSTS}"
+
+for nr in "${NRANKS_FACTORS[@]}";
+do
+  rm -f "hostfiles${NHOSTS}/hostfile${nr}"
+  touch "hostfiles${NHOSTS}/hostfile${nr}"
+  for h in "$@";
+  do
+    echo "$h port=222 slots=${nr}" >> "hostfiles${NHOSTS}/hostfile${nr}"
+  done
+done
diff --git a/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh b/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
new file mode 100644
index 00000000..a9d01cd7
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/init_ssh.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+PORT=${PORT:-222}
+
+while true; do
+  host=$1
+  if [[ -z $host ]]; then
+    break
+  fi
+  ssh -o StrictHostKeyChecking=no -p "${PORT}" "$host" \
+    echo "Hello from ${host}"
+  shift
+done
diff --git a/sample_workloads/nccltest/docker/root_scripts/tune_net.sh b/sample_workloads/nccltest/docker/root_scripts/tune_net.sh
new file mode 100644
index 00000000..12abbb99
--- /dev/null
+++ b/sample_workloads/nccltest/docker/root_scripts/tune_net.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+set -e
+
+# Dumps the irq binding of $ifname for sanity checking.
+dump_irq_binding() {
+  local -r ifname="$1"
+  echo -e "\n\ndump_irq_binding: ifname=${ifname}\n"
+  for irq in $(ls "/sys/class/net/${ifname}/device/msi_irqs/"); do
+    smp_affinity_list=$(cat "/proc/irq/${irq}/smp_affinity_list")
+    echo irq="$irq" smp_affinity_list="$smp_affinity_list"
+  done
+}
+
+set_irq_range() {
+  local -r nic="$1"
+  local core_start="$2"
+  local num_cores="$3"
+
+  # The user may not have this $nic configured on their VM; if not, just skip
+  # it, no need to error out.
+  if [[ ! -d "/sys/class/net/${nic}/device" ]]; then
+    return;
+  fi
+
+  echo "Setting irq binding for ${nic}..."
+
+  # We count the number of rx queues and assume number of rx queues == tx
+  # queues. Currently the GVE configuration at boot is 16 rx + 16 tx.
+  num_q=$(ls -1 "/sys/class/net/${nic}/queues/" | grep rx | wc -l)
+
+  irq_start=$(ls -1 "/sys/class/net/${nic}/device/msi_irqs" | sort -n | head -n 1)
+  idx=0
+  for ((queue = 0; queue < "$num_q"; queue++)); do
+    irq=$((irq_start + "$queue"))
+
+    core=$(( core_start + idx ))
+
+    # this is GVE's TX irq. See gve_tx_idx_to_ntfy().
+    echo "$core" > /proc/irq/"$irq"/smp_affinity_list
+
+    # this is GVE's RX irq. See gve_rx_idx_to_ntfy().
+    echo "$core" > /proc/irq/$(("$irq" + "$num_q"))/smp_affinity_list
+
+    idx=$(( (idx + 1) % num_cores ))
+  done
+}
+
+# The eth0-eth4 bindings below assume the NIC naming used by COS on GKE.
+# If this is being run on a system where the NICs have different names,
+# change these to the correct names.
+
+a3_bind_irqs() {
+  set_irq_range eth0 32 4
+  set_irq_range eth1 36 8
+  set_irq_range eth2 44 8
+  set_irq_range eth3 88 8
+  set_irq_range eth4 96 8
+}
+
+a3_bind_irqs
+
+dump_irq_binding eth0
+dump_irq_binding eth1
+dump_irq_binding eth2
+dump_irq_binding eth3
+dump_irq_binding eth4
diff --git a/sample_workloads/nccltest/docker/scripts/container_entry.sh b/sample_workloads/nccltest/docker/scripts/container_entry.sh
new file mode 100644
index 00000000..af968941
--- /dev/null
+++ b/sample_workloads/nccltest/docker/scripts/container_entry.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+# Container entry script. Runs NCCL-level benchmarks and reports results.
+
+set -u
+
+# Job parameters.
+: "${JOB_TIMESTAMP:?Must set JOB_TIMESTAMP}"
+: "${JOB_NAME:?Must set JOB_NAME}"
+: "${MASTER_ADDR:?Must set MASTER_ADDR}"
+: "${NNODES:?Must set NNODES}"
+: "${NODE_RANK:?Must set NODE_RANK}"
+
+# Benchmark parameters.
+: "${BENCHMARKS_CSV:?Must set BENCHMARKS_CSV}"
+: "${MASKS_CSV:?Must set MASKS_CSV}"
+: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}"
+: "${MSG_SIZE_END:?Must set MSG_SIZE_END}"
+: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}"
+: "${WARMUP_ITERS:?Must set WARMUP_ITERS}"
+: "${RUN_ITERS:?Must set RUN_ITERS}"
+: "${N_RUNS:?Must set N_RUNS}"
+
+# Unreserved cores for taskset call. This is a CSV of ranges for cores unused
+# by TCPX.
+: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}"
+
+# Telemetry.
+: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}"
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# If input is malformed, error straight away.
+IFS=',' read -r -a BENCHMARKS <<< "$BENCHMARKS_CSV"
+IFS=',' read -r -a MASKS <<< "$MASKS_CSV"
+
+if [[ "${#BENCHMARKS[@]}" -ne "${#MASKS[@]}" ]]; then
+  echo "Mismatching length of BENCHMARKS_CSV and MASKS_CSV, aborting..."
+  exit 1
+fi
+
+for BENCHMARK in ${BENCHMARKS[@]}; do
+  if [[ "$BENCHMARK" != "all_gather_perf" && "$BENCHMARK" != "all_reduce_perf" && \
+        "$BENCHMARK" != "reduce_scatter_perf" && "$BENCHMARK" != "broadcast_perf" && \
+        "$BENCHMARK" != "reduce_perf" && "$BENCHMARK" != "sendrecv_perf" && \
+        "$BENCHMARK" != "scatter_perf" && "$BENCHMARK" != "gather_perf" && \
+        "$BENCHMARK" != "alltoall_perf" && "$BENCHMARK" != "hypercube_perf" ]]; then
+    echo "${BENCHMARK} is not a legal benchmark, aborting..."
+    exit 2
+  fi
+done
+
+# Create shared directory for GCE compatibility.
+mkdir -p /usr/share/nccl_benchmarks
+
+# Wait for master node to be ready.
+MASTER_READY=""
+while [[ -z "$MASTER_READY" ]]; do
+  echo "Waiting for master..."
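+  # getent succeeds once the headless Service DNS record for the master pod
+  # resolves (the Service is created with clusterIP: None by the Helm chart).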
+ MASTER_READY=$( getent hosts ${MASTER_ADDR} ) + echo $MASTER_READY + sleep 5 +done + +# IRQ tunings. +/scripts/tune_net.sh + +# Start SSH service on hosts and send address of self to master node. +mkdir -p /run/mpi_bootstrap +service ssh start + +RANK_FILENAME="rank${NODE_RANK}.txt" +hostname > "$RANK_FILENAME" +while true; do + scp -o StrictHostKeyChecking=no -P 222 \ + "$RANK_FILENAME" "${MASTER_ADDR}:/run/mpi_bootstrap/" + + EXIT_STATUS=$? + if [[ "$EXIT_STATUS" -eq 0 ]]; then + break + fi + sleep 1 +done + +# Mount GCS. +echo "Mounting GCS..." +GCS_ROOT_DIR=/workspace/logs +JOB_LOG_DIR_NAME="${JOB_TIMESTAMP}_${JOB_NAME}_nnodes_${NNODES}_gpus_${GPUS_PER_NODE}" +JOB_LOG_DIR="${GCS_ROOT_DIR}/${JOB_LOG_DIR_NAME}" + +mkdir -p "$GCS_ROOT_DIR" +if [[ ! -z "${GCS_BUCKET}" ]]; then + gcsfuse --implicit-dirs "$GCS_BUCKET" "$GCS_ROOT_DIR" + echo "GCS mount complete; results at ${GCS_BUCKET}/${JOB_LOG_DIR_NAME}" +else + echo "GCS Bucket not specified, no logs will be uploaded. Local logs can be found at ${GCS_ROOT_DIR}/${JOB_LOG_DIR_NAME}" +fi + +if [[ "$NODE_RANK" -eq 0 ]]; then + # Once host information has arrived, initialize SSH, generate hostfile, and + # start the tests. + echo "Waiting for host information to arrive..." + NRANKS_READY=0 + while [[ "$NRANKS_READY" -lt "$NNODES" ]]; do + NRANKS_READY=$( ls /run/mpi_bootstrap/rank*.txt | wc -l ) + sleep 1 + done + + for (( i = 0; i < NNODES; ++i )); do + cat "/run/mpi_bootstrap/rank${i}.txt" >> /run/mpi_bootstrap/hosts.txt + done + + cat /run/mpi_bootstrap/hosts.txt | xargs /scripts/init_ssh.sh + pushd /scripts + cat /run/mpi_bootstrap/hosts.txt | xargs /scripts/gen_hostfiles.sh + popd + + # Run workload and process results. + for i in "${!BENCHMARKS[@]}"; do + BENCHMARK=${BENCHMARKS[i]} + MASK=${MASKS[i]} + echo "Running benchmark ${BENCHMARK} with mask ${MASK}..." + + BM_LOG_DIR="${JOB_LOG_DIR}/bm_${BENCHMARK}_mask_${MASK}" + mkdir -p "$BM_LOG_DIR" + + NNODES="$NNODES" \ + BM_LOG_DIR="$BM_LOG_DIR" \ + BENCHMARK="$BENCHMARK" \ + MASK="$MASK" \ + MSG_SIZE_BEGIN="$MSG_SIZE_BEGIN" \ + MSG_SIZE_END="$MSG_SIZE_END" \ + GPUS_PER_NODE="$GPUS_PER_NODE" \ + WARMUP_ITERS="$WARMUP_ITERS" \ + RUN_ITERS="$RUN_ITERS" \ + N_RUNS="$N_RUNS" \ + UNRESERVED_CORES="$UNRESERVED_CORES" \ + GPU_TELEMETRY="$GPU_TELEMETRY" \ + "${SCRIPT_DIR}/run_nccl_benchmark.sh" + done + + # Tell each node the MPI workload has terminated. + mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 \ + --mca routed direct --allow-run-as-root \ + -np "$NNODES" --hostfile /scripts/hostfiles${NNODES}/hostfile1 \ + touch /usr/share/nccl_benchmarks/workload_terminated +else + while [[ ! -e /usr/share/nccl_benchmarks/workload_terminated ]]; do + sleep 10 + done +fi diff --git a/sample_workloads/nccltest/docker/scripts/mpi_entry.sh b/sample_workloads/nccltest/docker/scripts/mpi_entry.sh new file mode 100644 index 00000000..00bb5d22 --- /dev/null +++ b/sample_workloads/nccltest/docker/scripts/mpi_entry.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Job parameters. +: "${RUN_LOG_DIR:?Must set RUN_LOG_DIR}" + +# Benchmark parameters. +: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}" +: "${MSG_SIZE_END:?Must set MSG_SIZE_END}" +: "${WARMUP_ITERS:?Must set WARMUP_ITERS}" +: "${RUN_ITERS:?Must set RUN_ITERS}" + +# Unreserved cores for taskset call. This is a CSV of ranges for cores unused +# by TCPX. +: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}" + +# Telemetry. +: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}" + +# OpenMPI parameters. 
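+# (The OMPI_COMM_WORLD_* variables below are exported automatically by Open
+# MPI's mpirun for every launched rank; no script in this image sets them.)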
+: "${OMPI_COMM_WORLD_RANK:?Must set OMPI_COMM_WORLD_RANK}" +: "${OMPI_COMM_WORLD_LOCAL_SIZE:?Must set OMPI_COMM_WORLD_LOCAL_SIZE}" +: "${OMPI_COMM_WORLD_LOCAL_RANK:?Must set OMPI_COMM_WORLD_LOCAL_RANK}" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Compute node rank based on global rank and local world size. +# This is a workaround for us not being able to send this info directly in MPI. +NODE_RANK=$(( OMPI_COMM_WORLD_RANK / OMPI_COMM_WORLD_LOCAL_SIZE )) + +TELEMETRY_DIR="${RUN_LOG_DIR}/telemetry/node${NODE_RANK}" +# Add GPU profiling. +NSYS_PREFIX="" +if [[ "$GPU_TELEMETRY" == "true" ]]; then + GPU_TELEMETRY_DIR="${TELEMETRY_DIR}/gpu" + mkdir -p "$GPU_TELEMETRY_DIR" + GPU_TELEMETRY_OUTPUT="${GPU_TELEMETRY_DIR}/rank${OMPI_COMM_WORLD_LOCAL_RANK}" + NSYS_PREFIX="nsys profile \ + --wait primary -o ${GPU_TELEMETRY_OUTPUT} \ + --force-overwrite true -t cuda,nvtx -s none --export sqlite" +fi + +$NSYS_PREFIX \ +taskset -c "$UNRESERVED_CORES" \ + /third_party/nccl-tests-mpi/build/${BENCHMARK} \ + -b "$MSG_SIZE_BEGIN" -e "$MSG_SIZE_END" -f 2 -g 1 -w "$WARMUP_ITERS" --iters "$RUN_ITERS" -c 0 diff --git a/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh b/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh new file mode 100644 index 00000000..d21d1663 --- /dev/null +++ b/sample_workloads/nccltest/docker/scripts/run_nccl_benchmark.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +set -u +set -o pipefail + +# Job parameters. +: "${NNODES:?Must set NNODES}" +: "${BM_LOG_DIR:?Must set BM_LOG_DIR}" + +# Benchmark parameters. +: "${BENCHMARK:?Must set BENCHMARK}" +: "${MASK:?Must set MASK}" +: "${MSG_SIZE_BEGIN:?Must set MSG_SIZE_BEGIN}" +: "${MSG_SIZE_END:?Must set MSG_SIZE_END}" +: "${GPUS_PER_NODE:?Must set GPUS_PER_NODE}" +: "${WARMUP_ITERS:?Must set WARMUP_ITERS}" +: "${RUN_ITERS:?Must set RUN_ITERS}" +: "${N_RUNS:?Must set N_RUNS}" + +# Unreserved cores for taskset call. This is a CSV of ranges for cores unused +# by TCPX. +: "${UNRESERVED_CORES:?Must set UNRESERVED_CORES}" + +# Telemetry. +: "${GPU_TELEMETRY:?Must set GPU_TELEMETRY}" + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +NRANKS=$(( NNODES * GPUS_PER_NODE )) + +# Generate CUDA_VISIBLE_DEVICES. +CUDA_VISIBLE_DEVICES=$( seq -s, 0 1 $(( GPUS_PER_NODE - 1 )) ) + +# Generate NCCL flags for application. +NCCL_FLAGS=$( env | egrep ^NCCL | awk '{ printf "-x %s ", $0; }' ) + +# Run actual NCCL benchmarks. +for (( i = 1; i <= $N_RUNS; ++i )); do + RUN_LOG_DIR="${BM_LOG_DIR}/per_run_results/run_${i}" + mkdir -p "$RUN_LOG_DIR" + LOGFILE_PATH="${RUN_LOG_DIR}/logs.txt" + + echo "benchmark: ${BENCHMARK}, mask: ${MASK}, run ${i}/${N_RUNS}" + + # Run benchmark, with a 3-hour timeout. + mpirun --mca btl tcp,self --mca btl_tcp_if_include eth0 \ + --mca routed direct --allow-run-as-root -np "$NRANKS" \ + --hostfile "/scripts/hostfiles${NNODES}/hostfile${GPUS_PER_NODE}" \ + --timeout $(( 3 * 60 * 60 )) \ + -x LD_LIBRARY_PATH -x PATH \ + -x "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" \ + $NCCL_FLAGS \ + -x "RUN_LOG_DIR=${RUN_LOG_DIR}" \ + -x "UNRESERVED_CORES=${UNRESERVED_CORES}" \ + -x "GPU_TELEMETRY=${GPU_TELEMETRY}" \ + -x "BENCHMARK=${BENCHMARK}" \ + -x "MSG_SIZE_BEGIN=${MSG_SIZE_BEGIN}" \ + -x "MSG_SIZE_END=${MSG_SIZE_END}" \ + -x "WARMUP_ITERS=${WARMUP_ITERS}" \ + -x "RUN_ITERS=${RUN_ITERS}" \ + -x "NCCL_TESTS_SPLIT_MASK=${MASK}" \ + "${SCRIPT_DIR}/mpi_entry.sh" 2>&1 | \ + tee "$LOGFILE_PATH" + + EXIT_STATUS=$? 
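+  # With `set -o pipefail` above, this captures mpirun's exit status even
+  # though its output is piped through tee.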
+ if [[ "$EXIT_STATUS" -ne 0 ]]; then + echo "WARNING: got non-zero exit status ${EXIT_STATUS}" + fi + + # Generate the CSVs for the run. + CSV_PATH="${RUN_LOG_DIR}/results.csv" + grep "float" "$LOGFILE_PATH" | \ + awk '{ printf "%s,%s,%s\n", $4, $9, $11 }' > "$CSV_PATH" +done diff --git a/sample_workloads/nccltest/gke/Chart.yaml b/sample_workloads/nccltest/gke/Chart.yaml new file mode 100644 index 00000000..997419b6 --- /dev/null +++ b/sample_workloads/nccltest/gke/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: nccl-benchmarks +description: nccl-benchmarks +type: application +version: 0.1.0 +appVersion: 0.1.0 diff --git a/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml b/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml new file mode 100644 index 00000000..2b0fbd8c --- /dev/null +++ b/sample_workloads/nccltest/gke/templates/nccl_benchmarks.yaml @@ -0,0 +1,176 @@ +# yamllint disable +{{- $requiredVar := .Values.cluster.nNodes | required ".Values.cluster.nNodes is required, add --set cluster.nNodes=2" -}} + +{{ $timestamp := now | date "2006-01-02-150405" }} + +apiVersion: v1 +kind: Service +metadata: + name: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" +spec: + selector: + name: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" + clusterIP: None +--- +{{ $node_count := .Values.cluster.nNodes | int }} +{{ $superblock_count := .Values.cluster.nSuperblocks | int }} +{{ $nodesPerSuperblock := divf $node_count $superblock_count | ceil }} + +{{- range $node_index, $element := until $node_count }} +apiVersion: v1 +kind: Pod +metadata: + name: nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}-pod{{ $node_index }} + {{- if eq $node_index 0 }} + labels: + name: nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }} + {{- end }} +spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + hostname: nccl-benchmarks-pod{{ $node_index }} + subdomain: nccl-benchmarks-{{ $timestamp }} + serviceAccountName: "default" + restartPolicy: Never + {{- if $.Values.cluster.sbPlacement }} + {{ $superblockChunk := div $node_index $nodesPerSuperblock | int }} + {{ $superblockIndex := add $.Values.cluster.startSuperblock $superblockChunk | int }} + nodeSelector: + superblock: "{{ $superblockIndex }}" + {{- end }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + volumes: + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia/lib64 + - name: tcpd-socket + hostPath: + path: /run/tcpx + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 200Gi + - name: tcpx-nccl-plugin-volume + emptyDir: {} + - name: workload-terminated-volume + emptyDir: {} + initContainers: + - name: tcpx-nccl-plugin-installer + image: {{ $.Values.ncclPlugin.image }}:{{ $.Values.ncclPlugin.tag }} + imagePullPolicy: Always + volumeMounts: + - name: tcpx-nccl-plugin-volume + mountPath: /var/lib/tcpx + resources: + requests: + cpu: 150m + command: + - /bin/bash + - -c + - | + /scripts/container_entry.sh install --install-nccl + {{- if $.Values.telemetry.gpu }} \ + --nccl-buildtype=nvtx + {{- end }} + containers: + - name: tcpd-daemon + image: {{ $.Values.rxdm.image }}:{{ $.Values.rxdm.tag }} + imagePullPolicy: Always + command: + - "bash" + - "-c" + - | + /tcpgpudmarxd/build/app/tcpgpudmarxd {{- range $.Values.rxdm.flags }} {{.}} {{- end }} & + while [ ! 
-e "/usr/share/nccl_benchmarks/workload_terminated" ]; do sleep 10; done + pkill -e "^"tcpgpudmarxd || true + sleep 30 + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia/lib64 + - name: tcpd-socket + mountPath: /tmp + - name: workload-terminated-volume + mountPath: /usr/share/nccl_benchmarks + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + - name: nccl-benchmarks + image: {{ $.Values.ncclBenchmarks.image }}:{{ $.Values.ncclBenchmarks.tag }} + imagePullPolicy: Always + securityContext: + privileged: true + capabilities: + add: + - SYS_ADMIN + - SYS_PTRACE + - IPC_LOCK + env: + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_NAME + value: "{{ $.Release.Name }}" + - name: MASTER_ADDR + value: "nccl-benchmarks-{{ $.Release.Name }}-{{ $timestamp }}" + - name: NNODES + value: "{{ $node_count }}" + - name: NODE_RANK + value: "{{ $node_index }}" + - name: GCS_BUCKET + value: "{{ $.Values.cluster.gcsBucket }}" + - name: LD_LIBRARY_PATH + value: "/usr/local/tcpx/lib64:/usr/local/nvidia/lib64" + - name: BENCHMARKS_CSV + value: "{{ $.Values.ncclBenchmarks.benchmarks }}" + - name: MASKS_CSV + value: "{{ $.Values.ncclBenchmarks.masks }}" + - name: MSG_SIZE_BEGIN + value: "{{ $.Values.ncclBenchmarks.msgSizeBegin }}" + - name: MSG_SIZE_END + value: "{{ $.Values.ncclBenchmarks.msgSizeEnd }}" + - name: GPUS_PER_NODE + value: "{{ $.Values.ncclBenchmarks.gpusPerNode }}" + - name: WARMUP_ITERS + value: "{{ $.Values.ncclBenchmarks.warmupIters }}" + - name: RUN_ITERS + value: "{{ $.Values.ncclBenchmarks.runIters }}" + - name: N_RUNS + value: "{{ $.Values.ncclBenchmarks.nRuns }}" + - name: UNRESERVED_CORES + value: "{{ $.Values.ncclPlugin.unreservedCores }}" + - name: GPU_TELEMETRY + value: "{{ $.Values.telemetry.gpu }}" + {{- range $key, $value := $.Values.ncclPlugin.envs }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- if $.Values.telemetry.gpu }} + - name: NCCL_PROXY_NVTX_ENABLE + value: "1" + {{- end }} + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia/lib64 + - name: tcpd-socket + mountPath: /tmp + - name: shared-memory + mountPath: /dev/shm + - name: tcpx-nccl-plugin-volume + mountPath: /usr/local/tcpx + - name: workload-terminated-volume + mountPath: /usr/share/nccl_benchmarks + resources: + limits: + nvidia.com/gpu: !!int 8 +--- +{{- end }} diff --git a/sample_workloads/nccltest/gke/values.yaml b/sample_workloads/nccltest/gke/values.yaml new file mode 100644 index 00000000..bbb66df3 --- /dev/null +++ b/sample_workloads/nccltest/gke/values.yaml @@ -0,0 +1,59 @@ +cluster: + nNodes: null # Must specify on commandline (--set cluster.nNodes=2) + sbPlacement: false # Set `true` if running across multiple superblocks + nSuperblocks: 1 + startSuperblock: 1 + gcsBucket: null + +ncclBenchmarks: + image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-benchmark" + tag: "latest" + # A comma-separated list of benchmarks to run. + benchmarks: "all_gather_perf,all_reduce_perf,sendrecv_perf" + # A comma-separated list of hex masks for the benchmarks. + # Must be of the same length as ncclBenchmarks.benchmarks. + # Each mask is recommended to be less than ncclBenchmarks.gpusPerNode. + masks: "0x0,0x0,0x7" + msgSizeBegin: "1M" + msgSizeEnd: "8G" + # Number of GPUs per node. Must be one of 1, 2, 4, 8. 
+  gpusPerNode: 8
+  warmupIters: 5
+  runIters: 100
+  nRuns: 5
+
+telemetry:
+  gpu: false
+
+rxdm:
+  image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev"
+  tag: "v2.0.9"
+  flags: ["--setup_param \"--verbose 128 2 0\"", "--gpu_nic_preset a3vm", "--gpu_shmem_type fd"]
+
+ncclPlugin:
+  image: "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev"
+  tag: "v3.1.6_2023_10_23"
+  unreservedCores: "0-7,104-111,52-59,156-163"
+  envs:
+    NCCL_GPUDIRECTTCPX_FORCE_ACK: "0"
+    NCCL_SOCKET_IFNAME: "eth0"
+    NCCL_DYNAMIC_CHUNK_SIZE: 524288
+    NCCL_P2P_NET_CHUNKSIZE: 524288
+    NCCL_P2P_PCI_CHUNKSIZE: 524288
+    NCCL_P2P_NVL_CHUNKSIZE: 1048576
+    NCCL_GPUDIRECTTCPX_TX_BINDINGS:
+      "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+    NCCL_GPUDIRECTTCPX_RX_BINDINGS:
+      "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+    NCCL_NSOCKS_PERTHREAD: 4
+    NCCL_SOCKET_NTHREADS: 1
+    NCCL_MAX_NCHANNELS: 12
+    NCCL_MIN_NCHANNELS: 12
+    NCCL_GPUDIRECTTCPX_SOCKET_IFNAME: "eth1,eth2,eth3,eth4"
+    NCCL_GPUDIRECTTCPX_CTRL_DEV: "eth0"
+    NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS: "1000000"
+    NCCL_CROSS_NIC: 0
+    NCCL_ALGO: "Ring"
+    NCCL_PROTO: "Simple"
+    NCCL_NET_GDR_LEVEL: "PIX"
+    NCCL_P2P_PXN_LEVEL: 0
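+    # Note: any variable here whose name starts with "NCCL" is forwarded to
+    # the benchmark processes (run_nccl_benchmark.sh collects NCCL* variables
+    # from the container environment and passes them to mpirun via -x).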