microsoft · chhwang · Dec 27, 2023 · Dec 27, 2023 · Dec 27, 2023 · Dec 27, 2023
diff --git a/.github/workflows/integration-test-rocm-backup.yml b/.github/workflows/integration-test-rocm-backup.yml
@@ -0,0 +1,77 @@
+# Integration tests for MSCCL++ on ROCm: build on a self-hosted AMD runner,
+# then run the mscclpp-test performance binaries under mpirun.
+name: "IntegrationTest (ROCm)"
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  IntegrationTest:
+    runs-on: [ self-hosted, AMD ]
+    defaults:
+      run:
+        shell: bash
+    # NOTE(review): no timeout-minutes is set for this job, so GitHub's default
+    # job timeout applies; consider adding an explicit limit.
+    strategy:
+      matrix:
+        rocm: [ rocm6.0 ]
+    # One concurrency group per workflow, ref and matrix entry; starting a new
+    # run in the same group cancels the one in progress.
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }}
+      cancel-in-progress: true
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build
+        run: |
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Release ..
+          # Bound parallelism to the core count; a bare "make -j" is unlimited.
+          make -j"$(nproc)"
+
+      # Every test below runs 8 MPI ranks and passes -o output.jsonl.
+      - name: Run mscclpp AllGather test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+
+      - name: Run mscclpp SendRecv test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+
+      - name: Run mscclpp AllReduce test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
+          # Kernels 5 and 6 use their own size ranges with -i rather than -f.
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
+
+      - name: Run mscclpp AllToAll test
+        run: |
+          set -e
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+          mpirun -np 8 --bind-to numa ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+
+      # Disabled step, kept for reference.
+      # - name: Check collective primitives performance
+      #   run: |
+      #     set -e
+      #     python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
diff --git a/.github/workflows/ut-rocm-backup.yml b/.github/workflows/ut-rocm-backup.yml
@@ -0,0 +1,59 @@
+# Unit tests for MSCCL++ on ROCm: build on a self-hosted AMD runner, then run
+# unit_tests directly and mp_unit_tests under mpirun with 2, 4 and 8 ranks.
+name: "UnitTest (ROCm)"
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  UnitTest:
+    runs-on: [ self-hosted, AMD ]
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm: [ rocm6.0 ]
+    # One concurrency group per workflow, ref and matrix entry; starting a new
+    # run in the same group cancels the one in progress.
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.rocm }}
+      cancel-in-progress: true
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build
+        run: |
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Release ..
+          # Bound parallelism to the core count; a bare "make -j" is unlimited.
+          make -j"$(nproc)"
+        working-directory: ${{ github.workspace }}
+
+      - name: UnitTests
+        run: |
+          ./build/test/unit_tests
+
+      - name: MpUnitTests
+        run: |
+          set -e
+          # Quote the filter so the shell never glob-expands the "*" characters.
+          # The leading "-" makes it a negative filter: tests matching *Ib* are skipped.
+          mpirun -np 2 ./build/test/mp_unit_tests --gtest_filter='-*Ib*'
+          mpirun -np 4 ./build/test/mp_unit_tests --gtest_filter='-*Ib*'
+          mpirun -np 8 ./build/test/mp_unit_tests --gtest_filter='-*Ib*'
+
+      # Disabled step, kept for reference.
+      # - name: PyTests
+      #   run: |
+      #     set -e
+      #     cd build && make pylib-copy
+      #     mpirun -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
@@ -27,8 +27,9 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
 ADD . /tmp/mscclpp
 WORKDIR /tmp/mscclpp
 ARG TARGET="cuda12.1"
-RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
-    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
+# Strip a trailing ".<digits>" from TARGET (e.g. cuda12.1 -> cuda12) to pick the requirements file.
+RUN target_type=$(echo "${TARGET}" | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir -r "python/requirements_${target_type}.txt"
 
 # Set PATH
 RUN echo PATH="${PATH}" > /etc/environment

diff --git a/docker/build.sh b/docker/build.sh
@@ -7,20 +7,22 @@ baseImageTable=(
     ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
     ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
+    ["rocm6.0"]="rocm/dev-ubuntu-20.04:6.0-complete"
 )
 
 declare -A extraLdPathTable
 extraLdPathTable=(
     ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
     ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
     ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
+    ["rocm6.0"]="/opt/rocm/lib"
 )
 
 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 TARGET=${1}
 
 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|rocm6.0]"
 }
 
 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then

diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -87,8 +87,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
 [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
 
 ```bash
-# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
-$ python3 -m pip install -r ./python/requirements_cu12.txt
+# Choose the `requirements_*.txt` file matching your CUDA/ROCm major version (e.g. `requirements_cuda12.txt` or `requirements_rocm6.txt`).
+$ python3 -m pip install -r ./python/requirements_cuda12.txt
 $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
 ```
 

diff --git a/python/requirements_cu11.txt → python/requirements_cuda11.txt b/python/requirements_cu11.txt → python/requirements_cuda11.txt
diff --git a/python/requirements_cu12.txt → python/requirements_cuda12.txt b/python/requirements_cu12.txt → python/requirements_cuda12.txt
diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt
@@ -0,0 +1,6 @@
+mpi4py
+prettytable
+netifaces
+pytest
+numpy
+matplotlib
diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh
@@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
 done
 
 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
-    pip3 install -r /root/mscclpp/python/requirements_cu11.txt
+    pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
 else
-    pip3 install -r /root/mscclpp/python/requirements_cu12.txt
+    pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
 fi
 
 cd /root/mscclpp && pip3 install .