Skip to content

Commit

Permalink
Merge branch 'main' into saemal/py_mscclpp
Browse files: browse the repository at this point in the history
  • Loading branch information
chhwang authored Nov 8, 2023
2 parents 23cbb94 + 0863e86 commit d84a9f1
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 22 deletions.
28 changes: 14 additions & 14 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
- MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+ MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand All @@ -53,11 +53,11 @@ jobs:
name: StartVMSS
displayName: Start VMSS
inputs:
- azureSubscription: mscclpp
+ azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
- az vmss start --name mscclpp-it-vmss --resource-group msccl-dev
+ az vmss start --name mscclit-vmss --resource-group msccl-IT
- task: Bash@3
name: DeployTestEnv
Expand All @@ -79,10 +79,10 @@ jobs:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
- touch output/mscclpp-it-000000
- tail -f output/mscclpp-it-000000 &
+ touch output/mscclit-000000
+ tail -f output/mscclit-000000 &
CHILD_PID=$!
- parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
+ parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test'
kill $CHILD_PID
Expand All @@ -98,10 +98,10 @@ jobs:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
- touch output/mscclpp-it-000000
- tail -f output/mscclpp-it-000000 &
+ touch output/mscclit-000000
+ tail -f output/mscclit-000000 &
CHILD_PID=$!
- parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
+ parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut'
kill $CHILD_PID
Expand All @@ -117,10 +117,10 @@ jobs:
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
rm -rf output/*
mkdir -p output
- touch output/mscclpp-it-000000
- tail -f output/mscclpp-it-000000 &
+ touch output/mscclit-000000
+ tail -f output/mscclit-000000 &
CHILD_PID=$!
- parallel-ssh -t 0 -H mscclpp-it-000000 -l azureuser -x "-i ${KeyFilePath}" \
+ parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests'
kill $CHILD_PID
Expand All @@ -129,8 +129,8 @@ jobs:
displayName: Deallocate VMSS
condition: always()
inputs:
- azureSubscription: mscclpp
+ azureSubscription: msccl-it
scriptType: bash
scriptLocation: inlineScript
inlineScript: |
- az vmss deallocate --name mscclpp-it-vmss --resource-group msccl-dev
+ az vmss deallocate --name mscclit-vmss --resource-group msccl-IT
4 changes: 2 additions & 2 deletions test/deploy/config
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
- Host mscclpp-it-000000
+ Host mscclit-000000
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
- Host mscclpp-it-000001
+ Host mscclit-000001
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
4 changes: 2 additions & 2 deletions test/deploy/hostfile
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
- azureuser@mscclpp-it-000000
- azureuser@mscclpp-it-000001
+ azureuser@mscclit-000000
+ azureuser@mscclit-000001
4 changes: 2 additions & 2 deletions test/deploy/hostfile_mpi
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
- mscclpp-it-000000
- mscclpp-it-000001
+ mscclit-000000
+ mscclit-000001
4 changes: 2 additions & 2 deletions test/deploy/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ function run_mp_ut()
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
/usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
-hostfile /root/mscclpp/hostfile_mpi -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
- -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclpp-it-000000:20003
+ -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003

echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
/usr/local/mpi/bin/mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile /root/mscclpp/hostfile_mpi -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
- -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclpp-it-000000:20003
+ -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
}

function run_pytests()
Expand Down

0 comments on commit d84a9f1

Please sign in to comment.