mscclpp-test in Python (#204)
Co-authored-by: Binyang Li <[email protected]>
Co-authored-by: Saeed Maleki <[email protected]>
Co-authored-by: Esha Choukse <[email protected]>
4 people authored Nov 16, 2023
1 parent e710701 commit 060fda1
Showing 24 changed files with 1,589 additions and 155 deletions.
17 changes: 17 additions & 0 deletions .azure-pipelines/integration-test.yml
@@ -112,3 +112,20 @@ jobs:
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
python3 -m pip install .
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ./python/requirements_cu11.txt
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'
25 changes: 22 additions & 3 deletions .azure-pipelines/multi-nodes-test.yml
@@ -83,7 +83,7 @@ jobs:
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- task: Bash@3
@@ -102,7 +102,7 @@
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- task: Bash@3
@@ -121,7 +121,26 @@
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
6 changes: 3 additions & 3 deletions .azure-pipelines/ut.yml
@@ -81,9 +81,9 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '$(containerImage)' == *'cuda11'* ]]; then
-   pip3 install -r ../python/test/requirements_cu11.txt
+   pip3 install -r ../python/requirements_cu11.txt
    else
-   pip3 install -r ../python/test/requirements_cu12.txt
+   pip3 install -r ../python/requirements_cu12.txt
fi
-   mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
+   mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
workingDirectory: '$(System.DefaultWorkingDirectory)'
4 changes: 3 additions & 1 deletion include/mscclpp/concurrency.hpp
@@ -33,11 +33,13 @@ struct DeviceSyncer {
    if (tmpIsAdd) {
      if (atomicAdd(&count_, 1) == maxOldCnt) {
        flag_ = 1;
+       count_ = 0;
      }
      POLL_MAYBE_JAILBREAK(!flag_, maxSpinCount);
    } else {
-     if (atomicSub(&count_, 1) == 1) {
+     if (atomicAdd(&count_, 1) == maxOldCnt) {
        flag_ = 0;
+       count_ = 0;
      }
      POLL_MAYBE_JAILBREAK(flag_, maxSpinCount);
    }
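For orientation, here is a minimal usage sketch of the barrier this change fixes. It is not part of the commit; the kernel, buffers, and launch shape are hypothetical. Every thread of every block calls sync() with the total block count, and the flag flip-flop lets the same DeviceSyncer be reused across consecutive barriers:

#include <mscclpp/concurrency.hpp>

// Hypothetical kernel: `syncer` points to zero-initialized device memory
// shared by all thread blocks of this launch.
__global__ void twoPhase(mscclpp::DeviceSyncer* syncer, int* buf, int* out, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) buf[tid] = tid * 2;             // phase 1: each block writes its slice
  syncer->sync(gridDim.x);                     // grid-wide barrier across all blocks
  if (tid < n) out[tid] = buf[(tid + 1) % n];  // phase 2: read another block's write
}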
17 changes: 15 additions & 2 deletions include/mscclpp/semaphore_device.hpp
@@ -23,7 +23,7 @@ struct Host2DeviceSemaphoreDeviceHandle {
}

/// Wait for the host to signal.
-   __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) {
+   __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) {
(*expectedInboundSemaphoreId) += 1;
POLL_MAYBE_JAILBREAK((cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*inboundSemaphoreId}.load(
cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)),
@@ -48,7 +48,7 @@ struct SmDevice2DeviceSemaphoreDeviceHandle {
}

/// Wait for the remote device to signal.
-   __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) {
+   __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) {
(*expectedInboundSemaphoreId) += 1;
POLL_MAYBE_JAILBREAK((cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*inboundSemaphoreId}.load(
cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)),
@@ -68,6 +68,19 @@ struct SmDevice2DeviceSemaphoreDeviceHandle {
cuda::memory_order_seq_cst);
}

/// Signal the remote device.
///
/// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations.
/// The user is required to apply proper fencing before calling this function.
///
__forceinline__ __device__ void relaxedSignal() {
  // No fence here: the relaxed store below does not order preceding writes,
  // so callers must fence before invoking this function.
  semaphoreIncrement();
  cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*remoteInboundSemaphoreId}.store(semaphoreGetLocal(),
                                                                                         cuda::memory_order_relaxed);
}

/// Signal the remote device for copied packets.
///
/// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is
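Because relaxedSignal() leaves ordering to the caller, the intended pattern is an explicit system-scope fence between the payload writes and the signal. A minimal sketch under that assumption (the producer function and buffer are illustrative, not part of this commit):

#include <cuda/atomic>
#include <mscclpp/semaphore_device.hpp>

// Hypothetical producer: write the payload, fence, then signal relaxed.
__device__ void produceAndSignal(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle& sem, int* sharedBuf) {
  sharedBuf[threadIdx.x] = 42;  // payload the peer reads after its wait()
  // relaxedSignal() does not order the store above; fence explicitly so the
  // peer observes the payload no later than the semaphore increment.
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  __syncthreads();
  if (threadIdx.x == 0) sem.relaxedSignal();
}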
65 changes: 9 additions & 56 deletions include/mscclpp/sm_channel_device.hpp
@@ -16,30 +16,22 @@ namespace Element {

/// Load an element from DRAM.
///
/// This is a wrapper of ld.volatile.global.* PTX instruction. Address alignment is not this function's
/// responsibility.
///
/// @param v The value to be loaded.
/// @param p The address of the value to be loaded.
///
template <typename T>
__forceinline__ __device__ void load(T& v, const T* p) {
-  // We should only use the specialized functions.
-  __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__);
+  v = *p;
}

/// Write an element on DRAM.
///
/// This is a wrapper of st.volatile.global.* PTX instruction. Address alignment is not this function's
/// responsibility.
///
/// @param p The address of the value to be written.
/// @param v The value to be written.
///
template <typename T>
__forceinline__ __device__ void store(T* p, const T& v) {
-  // We should only use the specialized functions.
-  __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__);
+  *p = v;
}

/// Copy aligned elements from the source memory to the destination memory.
@@ -64,52 +64,6 @@ __forceinline__ __device__ void copy(T* dst, T* src, uint64_t numElems, uint32_t
}
}

template <>
__forceinline__ __device__ void load<long long>(long long& v, const long long* p) {
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(v) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<long long>(long long* p, const long long& v) {
asm volatile("st.volatile.global.u64 [%0], %1;" : : "l"(p), "l"(v) : "memory");
}

template <>
__forceinline__ __device__ void load<int>(int& v, const int* p) {
asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(v) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<int>(int* p, const int& v) {
asm volatile("st.volatile.global.u32 [%0], %1;" : : "l"(p), "r"(v) : "memory");
}

template <>
__forceinline__ __device__ void load<longlong2>(longlong2& v, const longlong2* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<longlong2>(longlong2* p, const longlong2& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" : : "l"(p), "l"(v.x), "l"(v.y) : "memory");
}

template <>
__forceinline__ __device__ void load<int4>(int4& v, const int4* p) {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
: "l"(p)
: "memory");
}

template <>
__forceinline__ __device__ void store<int4>(int4* p, const int4& v) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};"
:
: "l"(p), "r"(v.x), "r"(v.y), "r"(v.z), "r"(v.w)
: "memory");
}

} // namespace Element

#endif // __CUDACC__
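With the volatile PTX specializations gone, the generic load()/store() above — and copy(), which is built on them — compile for any element type. A minimal sketch of driving copy() from a kernel (the kernel itself is hypothetical, and it assumes Element is nested inside the mscclpp namespace as in this header):

#include <mscclpp/sm_channel_device.hpp>

// Hypothetical: copy 16-byte int4 elements; dst and src must be int4-aligned.
__global__ void copyKernel(int4* dst, int4* src, uint64_t numElems) {
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t nthreads = gridDim.x * blockDim.x;
  mscclpp::Element::copy(dst, src, numElems, tid, nthreads);
}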
@@ -315,6 +261,13 @@ struct SmChannelDeviceHandle {
///
__forceinline__ __device__ void signal() { semaphore_.signal(); }

/// Signal the remote semaphore.
///
/// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations.
/// The user is required to apply proper fencing before calling this function.
///
__forceinline__ __device__ void relaxedSignal() { semaphore_.relaxedSignal(); }

/// Signal the remote semaphore for copied packets.
///
/// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is
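At the channel level the two signals pair with wait() on the peer: signal() orders prior memory operations itself, while relaxedSignal() requires the caller to fence first, as noted above. A minimal handshake sketch (hypothetical device function; `chan` is assumed to be a valid handle connected to the peer):

#include <mscclpp/sm_channel_device.hpp>

// Hypothetical handshake: one thread signals readiness and waits for the
// peer's matching signal; the rest of the block parks at __syncthreads().
__device__ void handshake(mscclpp::SmChannelDeviceHandle& chan) {
  if (threadIdx.x == 0) {
    chan.signal();  // prior writes are ordered by signal() itself
    chan.wait();    // spin until the peer has signaled us
  }
  __syncthreads();
}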
Empty file added python/benchmark/__init__.py