From 1d14e4e4df5ceb0a8058668d181b1ff454a4852b Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 9 Apr 2024 13:50:59 -0500 Subject: [PATCH 1/2] Update E3SM machine config file This is from https://github.com/E3SM-Project/E3SM/blob/rljacob/mach/new-chrys-soft/cime_config/machines/config_machines.xml which has not yet been merged into `master` but without which Chrysalis runs are hanging rather than canceling. --- mache/cime_machine_config/config_machines.xml | 907 ++++++++++++++---- 1 file changed, 695 insertions(+), 212 deletions(-) diff --git a/mache/cime_machine_config/config_machines.xml b/mache/cime_machine_config/config_machines.xml index 0e5bf4bf..7080d03f 100644 --- a/mache/cime_machine_config/config_machines.xml +++ b/mache/cime_machine_config/config_machines.xml @@ -244,6 +244,7 @@ cray-netcdf-hdf5parallel/4.9.0.3 cray-parallel-netcdf/1.12.3.3 cmake/3.24.3 + evp-patch @@ -266,6 +267,7 @@ $SHELL{if [ -z "$Trilinos_ROOT" ]; then echo /global/common/software/e3sm/mali_tpls/trilinos-e3sm-serial-release-gcc; else echo "$Trilinos_ROOT"; fi} $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + 4000MB $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/intel-2023.1.0; else echo "$ADIOS2_ROOT"; fi} @@ -288,6 +290,12 @@ $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/aocc-4.0.0; else echo "$ADIOS2_ROOT"; fi} + + $SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/intel; else echo "$MOAB_ROOT"; fi} + + + $SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/gnu; else echo "$MOAB_ROOT"; fi} + -1 @@ -384,6 +392,7 @@ cudatoolkit/11.7 craype-accel-nvidia80 + gcc-mixed/11.2.0 @@ -438,8 +447,166 @@ - - Muller small internal machine at NERSC with GPU nodes similar to pm-gpu + + Muller CPU-only nodes on internal NERSC machine, similar to pm-cpu (very small) + $ENV{NERSC_HOST}:muller + Linux + intel,gnu,nvidia,amdclang + mpich + e3sm + /global/cfs/cdirs/e3sm + e3sm,m3411,m3412 + $ENV{SCRATCH}/e3sm_scratch/muller-cpu + /global/cfs/cdirs/e3sm/www/$ENV{USER} + http://portal.nersc.gov/project/e3sm/$ENV{USER} + /global/cfs/cdirs/e3sm/inputdata + /global/cfs/cdirs/e3sm/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /global/cfs/cdirs/e3sm/baselines/$COMPILER + /global/cfs/cdirs/e3sm/tools/cprnc/cprnc + 10 + e3sm_developer + 4 + nersc_slurm + e3sm + 256 + 128 + TRUE + + srun + + --label + -n {{ total_tasks }} -N {{ num_nodes }} + -c $SHELL{echo 256/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} + $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} + -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} + + + + /usr/share/lmod/8.3.1/init/perl + /usr/share/lmod/8.3.1/init/python + /usr/share/lmod/8.3.1/init/sh + /usr/share/lmod/8.3.1/init/csh + /usr/share/lmod/lmod/libexec/lmod perl + /usr/share/lmod/lmod/libexec/lmod python + module + module + + + cpe + cray-hdf5-parallel + cray-netcdf-hdf5parallel + cray-parallel-netcdf + cray-netcdf + cray-hdf5 + PrgEnv-gnu + PrgEnv-intel + PrgEnv-nvidia + PrgEnv-cray + PrgEnv-aocc + intel + intel-oneapi + nvidia + aocc + cudatoolkit + climate-utils + craype-accel-nvidia80 + craype-accel-host + perftools-base + perftools + darshan + + + + + PrgEnv-gnu/8.5.0 + gcc-native/12.3 + cray-libsci/23.12.5 + + + + + PrgEnv-intel/8.5.0 + intel/2023.2.0 + + + + PrgEnv-nvidia + nvidia/23.9 + 
cray-libsci/23.12.5 + + + + PrgEnv-aocc + aocc/4.1.0 + cray-libsci/23.12.5 + + + + craype-accel-host + craype/2.7.30 + cray-mpich/8.1.28 + + cray-hdf5-parallel/1.12.2.9 + cray-netcdf-hdf5parallel/4.9.0.9 + cray-parallel-netcdf/1.12.3.9 + cmake/3.24.3 + + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + 0.20 + + + 1 + 1 + 128M + spread + threads + FALSE + /global/cfs/cdirs/e3sm/perl/lib/perl5-only-switch + software + MPI_Bcast + $SHELL{if [ -z "$Albany_ROOT" ]; then echo /global/common/software/e3sm/mali_tpls/albany-e3sm-serial-release-gcc; else echo "$Albany_ROOT"; fi} + $SHELL{if [ -z "$Trilinos_ROOT" ]; then echo /global/common/software/e3sm/mali_tpls/trilinos-e3sm-serial-release-gcc; else echo "$Trilinos_ROOT"; fi} + $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} + $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + + + $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/intel-2023.1.0; else echo "$ADIOS2_ROOT"; fi} + + + $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/gcc-11.2.0; else echo "$ADIOS2_ROOT"; fi} + Generic + + + $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/nvidia-22.7; else echo "$ADIOS2_ROOT"; fi} + + + $SHELL{if [ -z "$BLAS_ROOT" ]; then echo /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers; else echo "$BLAS_ROOT"; fi} + $SHELL{if [ -z "$LAPACK_ROOT" ]; then echo /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers; else echo "$LAPACK_ROOT"; fi} + NVHPC + + + Intel10_64_dyn + + + $SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/aocc-4.0.0; else echo "$ADIOS2_ROOT"; fi} + + + -1 + + + + + Muller GPU nodes on internal machine at NERSC. similar to pm-gpu $ENV{NERSC_HOST}:muller Linux gnugpu,gnu,nvidiagpu,nvidia @@ -447,7 +614,7 @@ e3sm_g /global/cfs/cdirs/e3sm e3sm,m3411,m3412 - $ENV{SCRATCH}/e3sm_scratch/muller + $ENV{SCRATCH}/e3sm_scratch/muller-gpu /global/cfs/cdirs/e3sm/www/$ENV{USER} http://portal.nersc.gov/project/e3sm/$ENV{USER} /global/cfs/cdirs/e3sm/inputdata @@ -488,6 +655,7 @@ module + cpe cray-hdf5-parallel cray-netcdf-hdf5parallel cray-parallel-netcdf @@ -514,15 +682,18 @@ PrgEnv-gnu/8.3.3 gcc/11.2.0 + PrgEnv-nvidia - nvidia/22.7 + nvidia/23.9 cudatoolkit/11.7 + craype-accel-nvidia80 @@ -546,7 +717,14 @@ cray-hdf5-parallel/1.12.2.3 cray-netcdf-hdf5parallel/4.9.0.3 cray-parallel-netcdf/1.12.3.3 + cmake/3.24.3 + evp-patch @@ -893,6 +1071,7 @@ --gpu-bind=closest romio_cb_read=disable 0 + $SHELL{which hipcc} 10 @@ -1034,11 +1213,12 @@ Linux crayclang-scream mpich - CLI133_crusher + CLI115 /lustre/orion/cli133/proj-shared/$ENV{USER}/e3sm_scratch/crusher /lustre/orion/cli115/world-shared/e3sm/inputdata /lustre/orion/cli115/world-shared/e3sm/inputdata/atm/datm7 $CIME_OUTPUT_ROOT/archive/$CASE + /lustre/orion/cli133/world-shared/e3sm/baselines/$COMPILER /lustre/orion/cli115/world-shared/e3sm/tools/cprnc/cprnc 8 1 @@ -1130,11 +1310,12 @@ Linux crayclang-scream mpich - CLI133_crusher + CLI115 /lustre/orion/cli133/proj-shared/$ENV{USER}/e3sm_scratch/crusher /lustre/orion/cli115/world-shared/e3sm/inputdata /lustre/orion/cli115/world-shared/e3sm/inputdata/atm/datm7 $CIME_OUTPUT_ROOT/archive/$CASE + /lustre/orion/cli133/world-shared/e3sm/baselines/$COMPILER /lustre/orion/cli115/world-shared/e3sm/tools/cprnc/cprnc 8 1 @@ -1207,6 +1388,92 @@ + + Frontier. AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs. 
+ .*frontier.* + CNL + crayclang-scream + mpich + cli115 + /lustre/orion/proj-shared/cli115 + .* + /lustre/orion/cli115/proj-shared/$ENV{USER}/e3sm_scratch + /lustre/orion/cli115/world-shared/e3sm/inputdata + /lustre/orion/cli115/world-shared/e3sm/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /lustre/orion/cli115/world-shared/e3sm/baselines/frontier/$COMPILER + /lustre/orion/cli115/world-shared/e3sm/tools/cprnc/cprnc + 8 + 1 + slurm + e3sm + 56 + 8 + TRUE + + + srun + + -l -K -n {{ total_tasks }} -N {{ num_nodes }} + --gpus-per-node=8 --gpu-bind=closest + -c $ENV{OMP_NUM_THREADS} + + + + + /usr/share/lmod/lmod/init/sh + /usr/share/lmod/lmod/init/csh + /usr/share/lmod/lmod/init/perl + /usr/share/lmod/lmod/init/env_modules_python.py + /usr/share/lmod/lmod/libexec/lmod perl + module + module + /usr/share/lmod/lmod/libexec/lmod python + + + PrgEnv-cray + craype-accel-amd-gfx90a + rocm/5.1.0 + libunwind/1.6.2 + + + cce/15.0.1 + craype craype/2.7.20 + cray-mpich cray-mpich/8.1.26 + cray-python/3.9.13.1 + subversion/1.14.1 + git/2.36.1 + cmake/3.21.3 + cray-hdf5-parallel/1.12.2.1 + cray-netcdf-hdf5parallel/4.9.0.1 + cray-parallel-netcdf/1.12.3.1 + darshan-runtime + + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + 0 + + $ENV{NETCDF_DIR} + $ENV{PNETCDF_DIR} + + 1 + 1 + 2 + $SHELL{which hipcc} + $ENV{CRAY_LD_LIBRARY_PATH}:$ENV{LD_LIBRARY_PATH} + True + + + + 128M + spread + threads + + + Stampede2. Intel skylake nodes at TACC. 48 cores per node, batch system is SLURM @@ -1512,7 +1779,7 @@ mappy LINUX proxy.sandia.gov:80 - gnu,gnu9,intel + gnu openmpi /sems-data-store/ACME/mappy/timings .* @@ -1548,24 +1815,15 @@ sems-archive-env acme-env sems-archive-git - sems-archive-cmake/3.19.1 + acme-cmake/3.26.3 - acme-gcc/8.1.0 - - - sems-archive-gcc/9.2.0 + acme-gcc/11.2.0 - - sems-archive-intel/19.0.5 - - + acme-netcdf/4.4.1/exo_acme acme-pfunit/3.2.8/base - - sems-archive-netcdf/4.7.3/base - acme-openmpi/4.1.4 acme-netcdf/4.7.4/acme @@ -1582,6 +1840,7 @@ spread threads Generic + 4000MB @@ -1957,7 +2216,6 @@ /nfs/gce/projects/climate/software/linux-ubuntu20.04-x86_64/netcdf/4.8.0c-4.3.1cxx-4.5.3f-parallel/mpich-4.0/gcc-11.1.0 /nfs/gce/projects/climate/software/linux-ubuntu20.04-x86_64/pnetcdf/1.12.2/mpich-4.0/gcc-11.1.0 $SHELL{if [ -z "$MOAB_ROOT" ]; then echo /nfs/gce/projects/climate/software/moab/devel/mpich-4.0/gcc-11.1.0; else echo "$MOAB_ROOT"; fi} - /nfs/gce/projects/climate/software/moab/devel/mpich-4.0/gcc-11.1.0 @@ -2304,12 +2562,12 @@ intel-mkl/2020.4.304-g2qaxzf - openmpi/4.1.3-pin4k7o - hdf5/1.10.7-eewgp6v - netcdf-c/4.4.1-ihoo4zq - netcdf-cxx/4.2-soitsxm - netcdf-fortran/4.4.4-tplolxh - parallel-netcdf/1.11.0-gvcfihh + openmpi/4.1.6-2mm63n2 + hdf5/1.10.7-4cghwvq + netcdf-c/4.4.1-a4hji6e + netcdf-cxx/4.2-ldoxr43 + netcdf-fortran/4.4.4-husened + parallel-netcdf/1.11.0-icrpxty intel-mpi/2019.9.304-tkzvizk @@ -2355,12 +2613,13 @@ $CIME_OUTPUT_ROOT/$CASE/bld 0.05 0.05 - 1000 + 0 /lcrc/group/e3sm/soft/perl/chrys/lib/perl5 $SHELL{dirname $(dirname $(which nc-config))} $SHELL{dirname $(dirname $(which nf-config))} $SHELL{dirname $(dirname $(which pnetcdf_version))} + ^lockedfile,individual 128M @@ -2437,7 +2696,7 @@ $CIME_OUTPUT_ROOT/$CASE/run $CIME_OUTPUT_ROOT/$CASE/bld 0.1 - 1000 + 0 $SHELL{dirname $(dirname $(which nc-config))} $SHELL{dirname $(dirname $(which nf-config))} @@ -2516,7 +2775,7 @@ $CIME_OUTPUT_ROOT/$CASE/run $CIME_OUTPUT_ROOT/$CASE/bld 0.1 - 1000 + 0 $SHELL{dirname $(dirname $(which nc-config))} $SHELL{dirname $(dirname $(which nf-config))} @@ -2636,32 
+2895,171 @@ - - LLNL Linux Cluster, Linux (pgi), 56 pes/node, batch system is Slurm + + ANL LCRC cluster 825-node AMD 7713 2-sockets 128-cores per node + ilogin(1|2|3|4).lcrc.anl.gov LINUX - intel - mpich - cbronze - /p/lustre2/$USER/e3sm_scratch/ruby - /usr/gdata/climdat/ccsm3data/inputdata - /usr/gdata/climdat/ccsm3data/inputdata/atm/datm7 - /p/lustre2/$USER/archive/$CASE - /p/lustre2/$USER/ccsm_baselines/$COMPILER - /usr/gdata/climdat/tools/cprnc + gnu + openmpi + e3sm + /lcrc/group/e3sm/$USER/scratch/improv + /lcrc/group/e3sm/data/inputdata + /lcrc/group/e3sm/data/inputdata/atm/datm7 + /lcrc/group/e3sm/$USER/scratch/improv/archive/$CASE + /lcrc/group/e3sm/baselines/improv/$COMPILER + /lcrc/group/e3sm/tools/cprnc/cprnc.improv 8 - lc_slurm - donahue5 -at- llnl.gov - 56 - 56 - - - - - srun - - - /usr/share/lmod/lmod/init/env_modules_python.py - /usr/share/lmod/lmod/init/perl + e3sm_integration + 8 + pbspro + E3SM + 128 + 128 + FALSE + + mpirun + + --tag-output -n {{ total_tasks }} + --map-by ppr:1:core:PE=$ENV{OMP_NUM_THREADS} --bind-to core --oversubscribe + + + + /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/gcc-9.3.0/lmod-8.3-5be73rg/lmod/lmod/init/sh + /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/gcc-9.3.0/lmod-8.3-5be73rg/lmod/lmod/init/csh + /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/gcc-9.3.0/lmod-8.3-5be73rg/lmod/lmod/init/env_modules_python.py + /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/gcc-9.3.0/lmod-8.3-5be73rg/lmod/lmod/libexec/lmod python + module + module + + + cmake/3.27.4 + + + gcc/12.3.0 + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.05 + 0 + + /lcrc/group/e3sm/soft/improv/netcdf-c/4.9.2b/gcc-12.3.0/openmpi-4.1.6 + /lcrc/group/e3sm/soft/improv/netcdf-fortran/4.6.1b/gcc-12.3.0/openmpi-4.1.6 + /lcrc/group/e3sm/soft/improv/pnetcdf/1.12.3/gcc-12.3.0/openmpi-4.1.6 + /lcrc/group/e3sm/soft/improv/pnetcdf/1.12.3/gcc-12.3.0/openmpi-4.1.6/bin:/lcrc/group/e3sm/soft/improv/netcdf-fortran/4.6.1b/gcc-12.3.0/openmpi-4.1.6/bin:/lcrc/group/e3sm/soft/improv/netcdf-c/4.9.2b/gcc-12.3.0/openmpi-4.1.6/bin:/lcrc/group/e3sm/soft/improv/openmpi/4.1.6/gcc-12.3.0/bin:/lcrc/group/e3sm/soft/perl/improv/bin:$ENV{PATH} + $SHELL{lp=/lcrc/group/e3sm/soft/improv/netlib-lapack/3.12.0/gcc-12.3.0:/lcrc/group/e3sm/soft/improv/pnetcdf/1.12.3/gcc-12.3.0/openmpi-4.1.6/lib:/lcrc/group/e3sm/soft/improv/netcdf-fortran/4.6.1b/gcc-12.3.0/openmpi-4.1.6/lib:/lcrc/group/e3sm/soft/improv/netcdf-c/4.9.2b/gcc-12.3.0/openmpi-4.1.6/lib:/opt/pbs/lib:/lcrc/group/e3sm/soft/improv/openmpi/4.1.6/gcc-12.3.0/lib; if [ -z "$LD_LIBRARY_PATH" ]; then echo $lp; else echo "$lp:$LD_LIBRARY_PATH"; fi} + + + 128M + + + cores + + + + + LLNL Linux Cluster, Linux, 4 V100 GPUs/node, 44 IBM P9 cpu cores/node + lassen.* + LINUX + gnugpu + spectrum-mpi + cbronze + /usr/workspace/$USER/e3sm_scratch + /usr/gdata/climdat/ccsm3data/inputdata + /usr/gdata/climdat/ccsm3data/inputdata/atm/datm7 + /usr/workspace/$USER/archive/$CASE + /usr/gdata/climdat/baselines/$COMPILER + 16 + lsf + donahue5 -at- llnl.gov + 40 + 40 + + + + + jsrun + + -X 1 + $SHELL{if [ {{ total_tasks }} -eq 1 ];then echo --nrs 1 --rs_per_host 1;else echo --nrs $NUM_RS --rs_per_host $RS_PER_NODE;fi} + --tasks_per_rs $SHELL{echo "({{ tasks_per_node }} + $RS_PER_NODE - 1)/$RS_PER_NODE"|bc} + -d plane:$SHELL{echo "({{ tasks_per_node }} + $RS_PER_NODE - 1)/$RS_PER_NODE"|bc} + --cpu_per_rs $ENV{CPU_PER_RS} + --gpu_per_rs $ENV{GPU_PER_RS} + --bind packed:smt:$ENV{OMP_NUM_THREADS} + 
--latency_priority $ENV{LTC_PRT} + --stdio_mode prepended + $ENV{JSRUN_THREAD_VARS} + $ENV{SMPIARGS} + + + + /usr/share/lmod/lmod/init/env_modules_python.py + /usr/share/lmod/lmod/init/perl + /usr/share/lmod/lmod/init/sh + /usr/share/lmod/lmod/init/csh + module + module + /usr/share/lmod/lmod/libexec/lmod python + /usr/share/lmod/lmod/libexec/lmod perl + + + git + gcc/8.3.1 + cuda/11.8.0 + cmake/3.16.8 + spectrum-mpi + python/3.7.2 + + + /p/gpfs1/$USER/e3sm_scratch/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + + + + + -E OMP_NUM_THREADS=$ENV{OMP_NUM_THREADS} -E OMP_PROC_BIND=spread -E OMP_PLACES=threads -E OMP_STACKSIZE=256M + + + y + /usr/gdata/climdat/netcdf/bin:$ENV{PATH} + /usr/gdata/climdat/netcdf/lib:$ENV{LD_LIBRARY_PATH} + /usr/gdata/climdat/netcdf + 2 + 20 + 2 + gpu-cpu + $SHELL{echo "2*((`./xmlquery --value TOTAL_TASKS` + `./xmlquery --value TASKS_PER_NODE` - 1)/`./xmlquery --value TASKS_PER_NODE`)"|bc} + --smpiargs="-gpu" + + + + + LLNL Linux Cluster, Linux (pgi), 56 pes/node, batch system is Slurm + LINUX + intel + mpich + cbronze + /p/lustre2/$USER/e3sm_scratch/ruby + /usr/gdata/climdat/ccsm3data/inputdata + /usr/gdata/climdat/ccsm3data/inputdata/atm/datm7 + /p/lustre2/$USER/archive/$CASE + /p/lustre2/$USER/ccsm_baselines/$COMPILER + /usr/gdata/climdat/tools/cprnc + 8 + lc_slurm + donahue5 -at- llnl.gov + 56 + 56 + + + + + srun + + + /usr/share/lmod/lmod/init/env_modules_python.py + /usr/share/lmod/lmod/init/perl /usr/share/lmod/lmod/init/sh /usr/share/lmod/lmod/init/csh module @@ -2675,17 +3073,20 @@ intel-classic/2021.6.0-magic mvapich2/2.3.7 cmake/3.19.2 - netcdf-fortran-parallel/4.6.0 - netcdf-c-parallel/4.9.0 + /usr/gdata/climdat/install/quartz/modulefiles + hdf5/1.12.2 + netcdf-c/4.9.0 + netcdf-fortran/4.6.0 parallel-netcdf/1.12.3 + screamML-venv/0.0.1 $CIME_OUTPUT_ROOT/$CASE/run $CIME_OUTPUT_ROOT/$CASE/bld - /usr/tce/packages/netcdf-fortran/netcdf-fortran-4.6.0-mvapich2-2.3.7-intel-classic-2021.6.0/ - /usr/tce/packages/parallel-netcdf/parallel-netcdf-1.12.3-mvapich2-2.3.7-intel-classic-2021.6.0/ - + /usr/gdata/climdat/install/quartz/netcdf-fortran/ + /usr/tce/packages/parallel-netcdf/parallel-netcdf-1.12.3-mvapich2-2.3.7-intel-classic-2021.6.0 + @@ -2963,6 +3364,101 @@ + + ALCF Polaris 560 nodes, 2.8 GHz AMD EPYC Milan 7543P 32c CPU, 4 NVIDIA A100 GPUs + polaris-* + Linux + gnu,gnugpu,nvidia,nvidiagpu + mpich + E3SM_RRM + E3SM_RRM + /grand/E3SMinput/performance_archive + SCREAM_Calib,E3SM_RRM + /eagle/$PROJECT/$USER/scratch + /grand/E3SMinput/data + /grand/E3SMinput/data/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /grand/E3SMinput/baselines/$COMPILER + /grand/E3SMinput/soft/cprnc/cprnc + 4 + e3sm_developer + 4 + pbspro + e3sm + 64 + 4 + 32 + 32 + TRUE + + mpiexec + + -np {{ total_tasks }} --label + -ppn {{ tasks_per_node }} + --cpu-bind depth -envall + -d $ENV{OMP_NUM_THREADS} + $ENV{GPU_TILE_COMPACT} + + + + /usr/share/lmod/8.3.1/init/python + /usr/share/lmod/8.3.1/init/sh + /usr/share/lmod/8.3.1/init/csh + /usr/share/lmod/lmod/libexec/lmod python + module + module + + + cmake/3.23.2 + craype-x86-rome + + + PrgEnv-gnu/8.3.3 + + + gcc/12.2.0 gcc/11.2.0 + cudatoolkit-standalone/11.4.4 + + + PrgEnv-nvhpc/8.3.3 + + + cudatoolkit-standalone/11.4.4 + craype-accel-nvidia80 + + + craype-network-ofi + libfabric/1.15.2.0 + cray-libsci/23.02.1.1 + cray-hdf5-parallel/1.12.2.3 + cray-netcdf-hdf5parallel/4.9.0.3 + cray-parallel-netcdf/1.12.3.3 + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + + $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} + $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + 
0 + host + + + + 1 + nvidia80 + /grand/E3SMinput/soft/qsub/set_affinity_gpu_polaris.sh + + + /opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so + + + 128M + spread + threads + + + ANL Sunspot Test and Development System (TDS), batch system is pbspro uan-.* @@ -3777,181 +4273,153 @@ - - LANL Linux Cluster, 36 pes/node, batch system slurm - gr-fe.*.lanl.gov - LINUX - intel,gnu - openmpi,impi,mvapich - climateacme - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/scratch - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/atm/datm7 - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/archive/$CASE - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER - /turquoise/usr/projects/climate/SHARED_CLIMATE/software/wolf/cprnc/v0.40/cprnc - 4 - e3sm_developer - slurm - luke.vanroekel @ gmail.com - 36 - 32 - TRUE - - srun - - -n {{ total_tasks }} - - - - - - - /usr/share/Modules/init/perl.pm - /usr/share/Modules/init/python.py - /etc/profile.d/z00_lmod.sh - /etc/profile.d/z00_lmod.csh - /usr/share/lmod/lmod/libexec/lmod perl - /usr/share/lmod/lmod/libexec/lmod python - module - module - - - cmake/3.16.2 - - - gcc/6.4.0 - openmpi/2.1.2 - - - gcc/6.4.0 - mvapich2/2.3 - - - intel/19.0.4 - intel-mpi/2019.4 - - - intel/18.0.2 - mvapich2/2.2 - - - intel/19.0.4 - openmpi/2.1.2 - - - friendly-testing - hdf5-parallel/1.8.16 - pnetcdf/1.11.2 - netcdf-h5parallel/4.7.3 - mkl/2019.0.4 - - - $CIME_OUTPUT_ROOT/$CASE/run - $CIME_OUTPUT_ROOT/$CASE/bld - - $ENV{MKLROOT} - romio_ds_write=disable;romio_ds_read=disable;romio_cb_write=enable;romio_cb_read=enable - - - - - LANL Linux Cluster, 36 pes/node, batch system slurm - ba-fe.*.lanl.gov - LINUX - intel,gnu - openmpi,impi,mvapich - climateacme - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/scratch - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/atm/datm7 - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/archive/$CASE - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER - /turquoise/usr/projects/climate/SHARED_CLIMATE/software/wolf/cprnc/v0.40/cprnc - 4 + + Chicoma CPU-only nodes at LANL IC. 
Each node has 2 AMD EPYC 7H12 64-Core (Milan) 512GB + ch-fe* + Linux + gnu,intel,nvidia,amdclang + mpich + /lustre/scratch5/$ENV{USER}/E3SM/scratch/chicoma-cpu + /usr/projects/e3sm/inputdata + /usr/projects/e3sm/inputdata/atm/datm7 + /lustre/scratch5/$ENV{USER}/E3SM/archive/$CASE + /lustre/scratch5/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER + /usr/projects/climate/SHARED_CLIMATE/software/badger/cprnc + 10 e3sm_developer + 4 slurm e3sm - 36 - 32 + 256 + 128 TRUE srun - -n {{ total_tasks }} - - - - + --label + -n {{ total_tasks }} -N {{ num_nodes }} + -c $SHELL{echo 256/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} + $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} + -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} + - - /usr/share/Modules/init/perl.pm - /usr/share/Modules/init/python.py - /etc/profile.d/z00_lmod.sh - /etc/profile.d/z00_lmod.csh + + /usr/share/lmod/8.3.1/init/perl + + /usr/share/lmod/8.3.1/init/python + /usr/share/lmod/8.3.1/init/sh + /usr/share/lmod/8.3.1/init/csh /usr/share/lmod/lmod/libexec/lmod perl /usr/share/lmod/lmod/libexec/lmod python module module + - - cmake/3.16.2 - - - gcc/6.4.0 - openmpi/2.1.2 + cray-hdf5-parallel + cray-netcdf-hdf5parallel + cray-parallel-netcdf + cray-netcdf + cray-hdf5 + PrgEnv-gnu + PrgEnv-intel + PrgEnv-nvidia + PrgEnv-cray + PrgEnv-aocc + intel + intel-oneapi + nvidia + aocc + cudatoolkit + climate-utils + craype-accel-nvidia80 + craype-accel-host + perftools-base + perftools + darshan - - gcc/6.4.0 - mvapich2/2.3 + + + PrgEnv-gnu/8.4.0 + gcc/12.2.0 + cray-libsci/23.05.1.4 - - intel/19.0.4 - intel-mpi/2019.4 + + + PrgEnv-nvidia/8.4.0 + nvidia/22.7 + cray-libsci/23.05.1.4 - - intel/18.0.2 - mvapich2/2.2 + + + PrgEnv-intel/8.4.0 + intel-classic/2023.2.0 - - intel/19.0.4 - openmpi/2.1.2 + + + PrgEnv-aocc/8.4.0 + aocc/3.2.0 + cray-libsci/23.05.1.4 + - friendly-testing - hdf5-parallel/1.8.16 - pnetcdf/1.11.2 - netcdf-h5parallel/4.7.3 - mkl/2019.0.4 + craype-accel-host + craype/2.7.21 + cray-mpich/8.1.26 + libfabric/1.15.2.0 + cray-hdf5-parallel/1.12.2.3 + cray-netcdf-hdf5parallel/4.9.0.3 + cray-parallel-netcdf/1.12.3.3 + cmake/3.25.1 + $CIME_OUTPUT_ROOT/$CASE/run $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + - $ENV{MKLROOT} + 1 + 1 + 128M + spread + threads + FALSE + /usr/projects/climate/SHARED_CLIMATE/software/chicoma-cpu/perl5-only-switch/lib/perl5 romio_ds_write=disable;romio_ds_read=disable;romio_cb_write=enable;romio_cb_read=enable + software + MPI_Bcast + $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} + $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + + -1 + - - Chicoma CPU-only nodes at LANL IC. Each node has 2 AMD EPYC 7H12 64-Core (Milan) 512GB + + Chicoma GPU nodes at LANL IC. 
Each GPU node has single +AMD EPYC 7713 64-Core (Milan) (256GB) and 4 nvidia A100' ch-fe* Linux - gnu,nvidia,intel,aocc,amdclang + gnugpu,gnu,nvidiagpu,nvidia mpich - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/scratch/chicoma-cpu + /lustre/scratch5/$ENV{USER}/E3SM/scratch/chicoma-gpu /usr/projects/e3sm/inputdata /usr/projects/e3sm/inputdata/atm/datm7 - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/archive/$CASE - /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER + /lustre/scratch5/$ENV{USER}/E3SM/archive/$CASE + /lustre/scratch5/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER /usr/projects/climate/SHARED_CLIMATE/software/badger/cprnc 10 e3sm_developer 4 slurm e3sm - 256 - 64 + 128 + 256 + 256 + 4 + 64 + 64 TRUE srun @@ -3959,7 +4427,7 @@ --label -n {{ total_tasks }} -N {{ num_nodes }} -c $ENV{OMP_NUM_THREADS} - $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} + $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} @@ -3978,50 +4446,64 @@ cray-hdf5-parallel cray-netcdf-hdf5parallel cray-parallel-netcdf + cray-netcdf + cray-hdf5 PrgEnv-gnu + PrgEnv-intel PrgEnv-nvidia PrgEnv-cray PrgEnv-aocc + intel + intel-oneapi + nvidia + aocc + cudatoolkit + climate-utils craype-accel-nvidia80 craype-accel-host - cce + perftools-base + perftools + darshan - + PrgEnv-gnu/8.4.0 - gcc/12.2.0 + gcc/11.2.0 - + PrgEnv-nvidia/8.4.0 nvidia/22.7 - - PrgEnv-intel/8.4.0 - intel-classic/2023.2.0 + + cudatoolkit/22.7_11.7 + craype-accel-nvidia80 - - PrgEnv-aocc/8.4.0 - aocc/3.2.0 + + cudatoolkit/22.7_11.7 + craype-accel-nvidia80 + gcc-mixed/11.2.0 - - PrgEnv-aocc/8.4.0 - aocc/3.2.0 + + craype-accel-host - + craype-accel-host - cray-libsci - craype + + + + cray-libsci/23.05.1.4 + craype/2.7.21 cray-mpich/8.1.26 libfabric/1.15.2.0 cray-hdf5-parallel/1.12.2.3 cray-netcdf-hdf5parallel/4.9.0.3 cray-parallel-netcdf/1.12.3.3 - cmake/3.22.3 + cmake/3.25.1 @@ -4042,6 +4524,7 @@ MPI_Bcast $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + /usr/projects/e3sm/cudatoolkit:$ENV{PKG_CONFIG_PATH} -1 From 8d606732664ea288b8901615a2804b1d0ecbbaef Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 9 Apr 2024 13:53:24 -0500 Subject: [PATCH 2/2] Update OpenMPI and other modules for Chrysalis --- mache/spack/chrysalis_intel_openmpi.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mache/spack/chrysalis_intel_openmpi.yaml b/mache/spack/chrysalis_intel_openmpi.yaml index 3fd830a8..f59e94fc 100644 --- a/mache/spack/chrysalis_intel_openmpi.yaml +++ b/mache/spack/chrysalis_intel_openmpi.yaml @@ -18,7 +18,7 @@ spack: all: compiler: [intel@20.0.4] providers: - mpi: [openmpi@4.1.3] + mpi: [openmpi@4.1.6] {% if e3sm_lapack %} lapack: [intel-mkl@2020.4.304] {% endif %} @@ -93,10 +93,10 @@ spack: buildable: false openmpi: externals: - - spec: openmpi@4.1.3 - prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/openmpi-4.1.3-pin4k7o + - spec: openmpi@4.1.6 + prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/openmpi-4.1.6-2mm63n2 modules: - - openmpi/4.1.3-pin4k7o + - openmpi/4.1.6-2mm63n2 buildable: false intel-mkl: externals: @@ -109,30 +109,30 @@ spack: hdf5: externals: - spec: hdf5@1.10.7+cxx+fortran+hl+mpi - prefix: 
/gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/hdf5-1.10.7-eewgp6v
+        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/hdf5-1.10.7-4cghwvq
         modules:
-        - hdf5/1.10.7-eewgp6v
+        - hdf5/1.10.7-4cghwvq
       buildable: false
     netcdf-c:
       externals:
       - spec: netcdf-c@4.4.1+mpi~parallel-netcdf
-        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/netcdf-c-4.4.1-ihoo4zq
+        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/netcdf-c-4.4.1-a4hji6e
         modules:
-        - netcdf-c/4.4.1-ihoo4zq
+        - netcdf-c/4.4.1-a4hji6e
       buildable: false
     netcdf-fortran:
       externals:
       - spec: netcdf-fortran@4.4.4
-        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/netcdf-fortran-4.4.4-tplolxh
+        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/netcdf-fortran-4.4.4-husened
         modules:
-        - netcdf-fortran/4.4.4-tplolxh
+        - netcdf-fortran/4.4.4-husened
       buildable: false
     parallel-netcdf:
       externals:
       - spec: parallel-netcdf@1.11.0+cxx+fortran
-        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/parallel-netcdf-1.11.0-gvcfihh
+        prefix: /gpfs/fs1/soft/chrysalis/spack/opt/spack/linux-centos8-x86_64/intel-20.0.4/parallel-netcdf-1.11.0-icrpxty
         modules:
-        - parallel-netcdf/1.11.0-gvcfihh
+        - parallel-netcdf/1.11.0-icrpxty
       buildable: false
 {% endif %}
 config: