mscclpp-test in Python (#204)
Co-authored-by: Binyang Li <[email protected]>
Co-authored-by: Saeed Maleki <[email protected]>
Co-authored-by: Esha Choukse <[email protected]>
4 people authored Nov 16, 2023
1 parent e710701 commit 060fda1
Showing 24 changed files with 1,589 additions and 155 deletions.
17 changes: 17 additions & 0 deletions .azure-pipelines/integration-test.yml
@@ -112,3 +112,20 @@ jobs:
set -e
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: PythonAllReduceBenchmark
displayName: Python Allreduce Benchmark
inputs:
targetType: 'inline'
script: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
python3 -m pip install .
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ./python/requirements_cu11.txt
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'
25 changes: 22 additions & 3 deletions .azure-pipelines/multi-nodes-test.yml
@@ -83,7 +83,7 @@ jobs:
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
kill $CHILD_PID
- task: Bash@3
@@ -102,7 +102,7 @@
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
kill $CHILD_PID
- task: Bash@3
@@ -121,7 +121,26 @@
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests'
+   -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
kill $CHILD_PID
- task: Bash@3
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
inputs:
targetType: 'inline'
script: |
set -e
HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
SSH_OPTION="StrictHostKeyChecking=no"
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
touch output/mscclit-000000
tail -f output/mscclit-000000 &
CHILD_PID=$!
parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
kill $CHILD_PID
- task: AzureCLI@2
6 changes: 3 additions & 3 deletions .azure-pipelines/ut.yml
@@ -81,9 +81,9 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '$(containerImage)' == *'cuda11'* ]]; then
-   pip3 install -r ../python/test/requirements_cu11.txt
+   pip3 install -r ../python/requirements_cu11.txt
    else
-   pip3 install -r ../python/test/requirements_cu12.txt
+   pip3 install -r ../python/requirements_cu12.txt
fi
-   mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
+   mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
workingDirectory: '$(System.DefaultWorkingDirectory)'
4 changes: 3 additions & 1 deletion include/mscclpp/concurrency.hpp
@@ -33,11 +33,13 @@ struct DeviceSyncer {
    if (tmpIsAdd) {
      if (atomicAdd(&count_, 1) == maxOldCnt) {
        flag_ = 1;
+       count_ = 0;
      }
      POLL_MAYBE_JAILBREAK(!flag_, maxSpinCount);
    } else {
-     if (atomicSub(&count_, 1) == 1) {
+     if (atomicAdd(&count_, 1) == maxOldCnt) {
        flag_ = 0;
+       count_ = 0;
      }
      POLL_MAYBE_JAILBREAK(flag_, maxSpinCount);
    }
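For orientation, here is a minimal usage sketch of the barrier this change fixes. It is not part of the commit; the kernel, buffers, and launch shape are hypothetical. Every thread of every block calls sync() with the total block count, and the flag flip-flop lets the same DeviceSyncer be reused across consecutive barriers:

#include <mscclpp/concurrency.hpp>

// Hypothetical kernel: `syncer` points to zero-initialized device memory
// shared by all thread blocks of this launch.
__global__ void twoPhase(mscclpp::DeviceSyncer* syncer, int* buf, int* out, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) buf[tid] = tid * 2;             // phase 1: each block writes its slice
  syncer->sync(gridDim.x);                     // grid-wide barrier across all blocks
  if (tid < n) out[tid] = buf[(tid + 1) % n];  // phase 2: read another block's write
}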
17 changes: 15 additions & 2 deletions include/mscclpp/semaphore_device.hpp
@@ -23,7 +23,7 @@ struct Host2DeviceSemaphoreDeviceHandle {
}

/// Wait for the host to signal.
-   __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) {
+   __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) {
(*expectedInboundSemaphoreId) += 1;
POLL_MAYBE_JAILBREAK((cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*inboundSemaphoreId}.load(
cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)),
@@ -48,7 +48,7 @@ struct SmDevice2DeviceSemaphoreDeviceHandle {
}

/// Wait for the remote device to signal.
-   __forceinline__ __device__ void wait(int64_t maxSpinCount = 10000000) {
+   __forceinline__ __device__ void wait(int64_t maxSpinCount = 100000000) {
(*expectedInboundSemaphoreId) += 1;
POLL_MAYBE_JAILBREAK((cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*inboundSemaphoreId}.load(
cuda::memory_order_acquire) < (*expectedInboundSemaphoreId)),
@@ -68,6 +68,19 @@ struct SmDevice2DeviceSemaphoreDeviceHandle {
cuda::memory_order_seq_cst);
}

/// Signal the remote device.
///
/// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations.
/// The user is required to apply proper fencing before calling this function.
///
__forceinline__ __device__ void relaxedSignal() {
  // No fence here: the relaxed store below does not order preceding writes,
  // so callers must fence before invoking this function.
  semaphoreIncrement();
  cuda::atomic_ref<uint64_t, cuda::thread_scope_system>{*remoteInboundSemaphoreId}.store(semaphoreGetLocal(),
                                                                                         cuda::memory_order_relaxed);
}

/// Signal the remote device for copied packets.
///
/// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is
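Because relaxedSignal() leaves ordering to the caller, the intended pattern is an explicit system-scope fence between the payload writes and the signal. A minimal sketch under that assumption (the producer function and buffer are illustrative, not part of this commit):

#include <cuda/atomic>
#include <mscclpp/semaphore_device.hpp>

// Hypothetical producer: write the payload, fence, then signal relaxed.
__device__ void produceAndSignal(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle& sem, int* sharedBuf) {
  sharedBuf[threadIdx.x] = 42;  // payload the peer reads after its wait()
  // relaxedSignal() does not order the store above; fence explicitly so the
  // peer observes the payload no later than the semaphore increment.
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  __syncthreads();
  if (threadIdx.x == 0) sem.relaxedSignal();
}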
65 changes: 9 additions & 56 deletions include/mscclpp/sm_channel_device.hpp
@@ -16,30 +16,22 @@ namespace Element {

/// Load an element from DRAM.
///
/// This is a wrapper of ld.volatile.global.* PTX instruction. Address alignment is not this function's
/// responsibility.
///
/// @param v The value to be loaded.
/// @param p The address of the value to be loaded.
///
template <typename T>
__forceinline__ __device__ void load(T& v, const T* p) {
-  // We should only use the specialized functions.
-  __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__);
+  v = *p;
}

/// Write an element on DRAM.
///
/// This is a wrapper of st.volatile.global.* PTX instruction. Address alignment is not this function's
/// responsibility.
///
/// @param p The address of the value to be written.
/// @param v The value to be written.
///
template <typename T>
__forceinline__ __device__ void store(T* p, const T& v) {
-  // We should only use the specialized functions.
-  __assert_fail("Unsupported type", __FILE__, __LINE__, __PRETTY_FUNCTION__);
+  *p = v;
}

/// Copy aligned elements from the source memory to the destination memory.
@@ -64,52 +64,6 @@ __forceinline__ __device__ void copy(T* dst, T* src, uint64_t numElems, uint32_t
}
}

template <>
__forceinline__ __device__ void load<long long>(long long& v, const long long* p) {
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(v) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<long long>(long long* p, const long long& v) {
asm volatile("st.volatile.global.u64 [%0], %1;" : : "l"(p), "l"(v) : "memory");
}

template <>
__forceinline__ __device__ void load<int>(int& v, const int* p) {
asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(v) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<int>(int* p, const int& v) {
asm volatile("st.volatile.global.u32 [%0], %1;" : : "l"(p), "r"(v) : "memory");
}

template <>
__forceinline__ __device__ void load<longlong2>(longlong2& v, const longlong2* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}

template <>
__forceinline__ __device__ void store<longlong2>(longlong2* p, const longlong2& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" : : "l"(p), "l"(v.x), "l"(v.y) : "memory");
}

template <>
__forceinline__ __device__ void load<int4>(int4& v, const int4* p) {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
: "l"(p)
: "memory");
}

template <>
__forceinline__ __device__ void store<int4>(int4* p, const int4& v) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};"
:
: "l"(p), "r"(v.x), "r"(v.y), "r"(v.z), "r"(v.w)
: "memory");
}

} // namespace Element

#endif // __CUDACC__
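With the volatile PTX specializations gone, the generic load()/store() above — and copy(), which is built on them — compile for any element type. A minimal sketch of driving copy() from a kernel (the kernel itself is hypothetical, and it assumes Element is nested inside the mscclpp namespace as in this header):

#include <mscclpp/sm_channel_device.hpp>

// Hypothetical: copy 16-byte int4 elements; dst and src must be int4-aligned.
__global__ void copyKernel(int4* dst, int4* src, uint64_t numElems) {
  uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  uint32_t nthreads = gridDim.x * blockDim.x;
  mscclpp::Element::copy(dst, src, numElems, tid, nthreads);
}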
@@ -315,6 +261,13 @@ struct SmChannelDeviceHandle {
///
__forceinline__ __device__ void signal() { semaphore_.signal(); }

/// Signal the remote semaphore.
///
/// This function is a relaxed version of signal() and provides no guarantee on the completion of memory operations.
/// The user is required to apply proper fencing before calling this function.
///
__forceinline__ __device__ void relaxedSignal() { semaphore_.relaxedSignal(); }

/// Signal the remote semaphore for copied packets.
///
/// Unlike @ref signal(), this function provides no guarantee on the completion of memory operations. This is
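At the channel level the two signals pair with wait() on the peer: signal() orders prior memory operations itself, while relaxedSignal() requires the caller to fence first, as noted above. A minimal handshake sketch (hypothetical device function; `chan` is assumed to be a valid handle connected to the peer):

#include <mscclpp/sm_channel_device.hpp>

// Hypothetical handshake: one thread signals readiness and waits for the
// peer's matching signal; the rest of the block parks at __syncthreads().
__device__ void handshake(mscclpp::SmChannelDeviceHandle& chan) {
  if (threadIdx.x == 0) {
    chan.signal();  // prior writes are ordered by signal() itself
    chan.wait();    // spin until the peer has signaled us
  }
  __syncthreads();
}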
Empty file added python/benchmark/__init__.py