From 32a1a3799aac4e74c893abcd59fcf2680f11471b Mon Sep 17 00:00:00 2001
From: Christian Schou Oxvig
Date: Wed, 29 Nov 2023 14:41:05 +0100
Subject: [PATCH] LUMI mpi4py OSU test scripts WIP.

---
 examples/LUMI/conda_mpi4py_mpich/README.md    |  3 -
 .../run_cotainr_bind_mpi_hello_world.sh       |  4 +-
 .../run_cotainr_bind_osu.sh                   | 69 +++++++++++++++++++
 .../run_cotainr_hybrid_mpi_hello_world.sh     |  2 +-
 .../run_cotainr_hybrid_osu.sh                 | 67 ++++++++++++++++++
 .../run_cray_python_mpi_hello_world.sh        |  4 +-
 .../conda_mpi4py_mpich/run_cray_python_osu.sh | 46 +++++++++++++
 .../run_lumisif_mpi_hello_world.sh            |  4 +-
 8 files changed, 189 insertions(+), 10 deletions(-)
 create mode 100644 examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_osu.sh
 create mode 100644 examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_osu.sh
 create mode 100644 examples/LUMI/conda_mpi4py_mpich/run_cray_python_osu.sh

diff --git a/examples/LUMI/conda_mpi4py_mpich/README.md b/examples/LUMI/conda_mpi4py_mpich/README.md
index c3eab48..36470aa 100644
--- a/examples/LUMI/conda_mpi4py_mpich/README.md
+++ b/examples/LUMI/conda_mpi4py_mpich/README.md
@@ -24,9 +24,6 @@ Copy everything to LUMI, update the `--account=project_` SBATCH
 
 TODO:
 - Ideally, update the conda numpy package to 1.26.1, though it may be a problem: https://github.com/conda-forge/numpy-feedstock/pull/302
-- mpi4py conda packages not compatible with mpich=3.4.3 conda packages (or is it?)
-- Set a better shebang: #!/bin/bash -e
-- Rename cray_python to cray-python in sbatch file names.
 - TL;DR or somehow separate recommended example from tests/benchmarks
 
 
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_mpi_hello_world.sh b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_mpi_hello_world.sh
index 7b72401..676d58b 100644
--- a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_mpi_hello_world.sh
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_mpi_hello_world.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash -e
 #
 # A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
 # https://github.com/DeiC-HPC/cotainr
@@ -24,7 +24,7 @@ CONTAINERS=(\
 export MPIR_CVAR_DEBUG_SUMMARY=1
 export FI_LOG_LEVEL=Info
 
-source lumi-singularity-bindings.sh # or use the LUMI singularity-bindings module
+source $PROJECT_DIR/lumi-singularity-bindings.sh # or use the LUMI singularity-bindings module
 
 for container in ${CONTAINERS[@]}; do
     echo "=============== Run using $container ==============="
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_osu.sh b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_osu.sh
new file mode 100644
index 0000000..c9ab2ca
--- /dev/null
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_bind_osu.sh
@@ -0,0 +1,69 @@
+#!/bin/bash -e
+#
+# A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
+# https://github.com/DeiC-HPC/cotainr
+# This script runs the OSU benchmarks with Numpy buffers
+# using a cotainr container including a generic MPICH, bind mounting the host MPI.
+#
+#SBATCH --job-name=mpi4py-cotainr-bind-osu
+#SBATCH --nodes=2
+#SBATCH --tasks-per-node=1
+#SBATCH --output="output_%x_%j.txt"
+#SBATCH --partition=small
+#SBATCH --exclusive
+#SBATCH --time=00:10:00
+#SBATCH --account=project_
+
+PROJECT_DIR=
+OSU_PY_BENCHMARK_DIR=$PROJECT_DIR/osu-micro-benchmarks-7.0.1/python/
+RESULTS_DIR=$PROJECT_DIR/test_results
+CONTAINERS=(\
+    "cotainr-mpich3-pip-mpi4py.sif" \
+    "cotainr-mpich4-pip-mpi4py.sif")
+
+set -x
+mkdir -p $RESULTS_DIR
+
+source $PROJECT_DIR/lumi-singularity-bindings.sh # or use the LUMI singularity-bindings module
+
+for container in ${CONTAINERS[@]}; do
+    # Single node runs
+    srun --nodes=1 --tasks-per-node=2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-bw-single-$container.txt
+    srun --nodes=1 --tasks-per-node=2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-latency-single-$container.txt
+    srun --nodes=1 --tasks-per-node=2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-single-$container.txt
+
+    # Multi node runs
+    srun \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-bw-multi-$container.txt
+    srun \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-latency-multi-$container.txt
+    srun \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-multi-$container.txt
+done
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_mpi_hello_world.sh b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_mpi_hello_world.sh
index e89f482..9fc5652 100644
--- a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_mpi_hello_world.sh
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_mpi_hello_world.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash -e
 #
 # A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
 # https://github.com/DeiC-HPC/cotainr
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_osu.sh b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_osu.sh
new file mode 100644
index 0000000..d523087
--- /dev/null
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cotainr_hybrid_osu.sh
@@ -0,0 +1,67 @@
+#!/bin/bash -e
+#
+# A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
+# https://github.com/DeiC-HPC/cotainr
+# This script runs the OSU benchmarks with Numpy buffers
+# using a cotainr container including a generic MPICH, using the container MPI.
+#
+#SBATCH --job-name=mpi4py-cotainr-hybrid-osu
+#SBATCH --nodes=2
+#SBATCH --tasks-per-node=1
+#SBATCH --output="output_%x_%j.txt"
+#SBATCH --partition=small
+#SBATCH --exclusive
+#SBATCH --time=00:30:00
+#SBATCH --account=project_
+
+PROJECT_DIR=
+OSU_PY_BENCHMARK_DIR=$PROJECT_DIR/osu-micro-benchmarks-7.0.1/python/
+RESULTS_DIR=$PROJECT_DIR/test_results
+CONTAINERS=(\
+    "cotainr-mpich3-pip-mpi4py.sif" \
+    "cotainr-mpich4-pip-mpi4py.sif")
+
+set -x
+mkdir -p $RESULTS_DIR
+
+for container in ${CONTAINERS[@]}; do
+    # Single node runs
+    srun --nodes=1 --tasks-per-node=2 --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-bw-single-$container.txt
+    srun --nodes=1 --tasks-per-node=2 --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-latency-single-$container.txt
+    srun --nodes=1 --tasks-per-node=2 --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-single-$container.txt
+
+    # Multi node runs
+    srun --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-bw-multi-$container.txt
+    srun --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-latency-multi-$container.txt
+    srun --mpi=pmi2 \
+        singularity exec \
+        --bind=$PROJECT_DIR \
+        $PROJECT_DIR/containers/$container \
+        python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \
+        > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-multi-$container.txt
+done
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cray_python_mpi_hello_world.sh b/examples/LUMI/conda_mpi4py_mpich/run_cray_python_mpi_hello_world.sh
index ed5f6b3..164b8ff 100644
--- a/examples/LUMI/conda_mpi4py_mpich/run_cray_python_mpi_hello_world.sh
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cray_python_mpi_hello_world.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash -e
 #
 # A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
 # https://github.com/DeiC-HPC/cotainr
@@ -21,4 +21,4 @@ PROJECT_DIR=
 export MPIR_CVAR_DEBUG_SUMMARY=1
 export FI_LOG_LEVEL=Info
 
-srun python $PROJECT_DIR/mpi_hello_world.py
+srun python3 $PROJECT_DIR/mpi_hello_world.py
diff --git a/examples/LUMI/conda_mpi4py_mpich/run_cray_python_osu.sh b/examples/LUMI/conda_mpi4py_mpich/run_cray_python_osu.sh
new file mode 100644
index 0000000..8de13c7
--- /dev/null
+++ b/examples/LUMI/conda_mpi4py_mpich/run_cray_python_osu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -e
+#
+# A LUMI SLURM batch script for the LUMI mpi4py MPICH example from
+# https://github.com/DeiC-HPC/cotainr
+# This script runs the OSU benchmarks with Numpy buffers
+# using the LUMI cray-python module
+#
+#SBATCH --job-name=mpi4py-cray-python-osu
+#SBATCH --nodes=2
+#SBATCH --tasks-per-node=1
+#SBATCH --output="output_%x_%j.txt"
+#SBATCH --partition=small
+#SBATCH --exclusive
+#SBATCH --time=00:10:00
+#SBATCH --account=project_
+
+module load cray-python
+
+PROJECT_DIR=
+OSU_PY_BENCHMARK_DIR=$PROJECT_DIR/osu-micro-benchmarks-7.0.1/python/ +RESULTS_DIR=$PROJECT_DIR/test_results + +set -x +mkdir -p $RESULTS_DIR + +# Single node runs +srun --nodes=1 --tasks-per-node=2 \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-bw-single.txt +srun --nodes=1 --tasks-per-node=2 \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-latency-single.txt +srun --nodes=1 --tasks-per-node=2 \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-single.txt + +# Multi node runs +srun \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=bw --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-bw-multi.txt +srun \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=latency --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-latency-multi.txt +srun \ + python3 $OSU_PY_BENCHMARK_DIR/run.py --benchmark=allgather --buffer=numpy \ + > $RESULTS_DIR/$SLURM_JOB_NAME-allgather-multi.txt diff --git a/examples/LUMI/conda_mpi4py_mpich/run_lumisif_mpi_hello_world.sh b/examples/LUMI/conda_mpi4py_mpich/run_lumisif_mpi_hello_world.sh index a930d40..33dcad9 100644 --- a/examples/LUMI/conda_mpi4py_mpich/run_lumisif_mpi_hello_world.sh +++ b/examples/LUMI/conda_mpi4py_mpich/run_lumisif_mpi_hello_world.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/bash -e # # A LUMI SLURM batch script for the LUMI mpi4py MPICH example from # https://github.com/DeiC-HPC/cotainr @@ -25,7 +25,7 @@ cat > $PROJECT_DIR/run-script.sh << EOF \$WITH_CONDA # Run application -python $PROJECT_DIR/mpi_hello_world.py +python3 $PROJECT_DIR/mpi_hello_world.py EOF chmod +x $PROJECT_DIR/run-script.sh
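
Note (not part of the patch): a minimal submission sketch for the new OSU benchmark scripts, assuming the `--account=project_` and `PROJECT_DIR=` placeholders have been filled in, the scripts and containers have been copied to the project directory on LUMI, and the OSU Python micro-benchmarks are unpacked at $PROJECT_DIR/osu-micro-benchmarks-7.0.1/:

    # Hypothetical usage sketch; submit each benchmark script with sbatch.
    cd $PROJECT_DIR
    sbatch run_cray_python_osu.sh       # baseline using the host cray-python module
    sbatch run_cotainr_hybrid_osu.sh    # container MPICH, launched via srun --mpi=pmi2
    sbatch run_cotainr_bind_osu.sh      # container with the host MPI bind mounted in
    # Each job writes its results to $PROJECT_DIR/test_results/, with file names built from
    # $SLURM_JOB_NAME, the benchmark (bw/latency/allgather), and single- vs multi-node placement.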