From 727b2140b81662316f6b6c51fa749ce5cb986b34 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Tue, 7 Mar 2023 22:59:47 +0100 Subject: [PATCH] Update config_machines.xml from E3SM/master commit: c4516984157e9dd616e902370328b71674f65642 --- mache/cime_machine_config/config_machines.xml | 400 ++++++++++++++++-- 1 file changed, 358 insertions(+), 42 deletions(-) diff --git a/mache/cime_machine_config/config_machines.xml b/mache/cime_machine_config/config_machines.xml index 132a843c..a863b175 100644 --- a/mache/cime_machine_config/config_machines.xml +++ b/mache/cime_machine_config/config_machines.xml @@ -233,6 +233,7 @@ $CIME_OUTPUT_ROOT/$CASE/run $CIME_OUTPUT_ROOT/$CASE/bld 0.1 + 0.20 1 @@ -568,7 +569,7 @@ Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs .*crusher.* CNL - gnu,crayclang,amdclang + gnu,crayclang,amdclang,gnugpu,crayclanggpu,amdclanggpu mpich cli133_crusher /gpfs/alpine/cli133/proj-shared/$ENV{USER}/e3sm_scratch/crusher @@ -583,14 +584,17 @@ e3sm 56 56 + 8 + 8 + 8 TRUE srun -l -K -n {{ total_tasks }} -N {{ num_nodes }} - --threads-per-core=1 -c $ENV{OMP_NUM_THREADS} - -m *:block + $ENV{NTASKS_PER_GPU} + $ENV{GPU_BIND_ARGS} @@ -602,19 +606,31 @@ module module /usr/share/lmod/lmod/libexec/lmod python - + PrgEnv-cray PrgEnv-cray/8.3.3 - cce cce/14.0.0 + cce cce/14.0.2 - + + craype-accel-amd-gfx90a + rocm/5.4.0 + + PrgEnv-cray PrgEnv-amd/8.3.3 amd amd/5.4.0 - + + craype-accel-amd-gfx90a + + PrgEnv-cray PrgEnv-gnu/8.3.3 + gcc gcc/11.2.0 + + + craype-accel-amd-gfx90a + rocm/5.4.0 cray-python/3.9.12.1 @@ -631,13 +647,209 @@ 0.1 0.25 0 + + $ENV{NETCDF_DIR} + $ENV{PNETCDF_DIR} + + + + + $ENV{CRAY_LIBSCI_DIR}/amd/4.0/x86_64/lib:$ENV{LD_LIBRARY_PATH} + + + --ntasks-per-gpu=$SHELL{echo "`./xmlquery --value MAX_MPITASKS_PER_NODE`/8"|bc} + --gpu-bind=closest + romio_cb_read=disable + 0 + + + 10 + 3 + + + 128M + spread + threads + + + + + + + + Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs + .*crusher.* + CNL + gnu,crayclang-scream,amdclang + mpich + CLI133_crusher + /gpfs/alpine/cli133/proj-shared/$ENV{USER}/e3sm_scratch/crusher + /gpfs/alpine/cli115/world-shared/e3sm/inputdata + /gpfs/alpine/cli115/world-shared/e3sm/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /gpfs/alpine/cli133/world-shared/e3sm/tools/cprnc/cprnc + 8 + 1 + slurm + e3sm + 56 + 56 + TRUE + + + srun + + -l -K -n {{ total_tasks }} -N {{ num_nodes }} + + + --threads-per-core=1 + -c $ENV{OMP_NUM_THREADS} + -m *:block + + + + + + /usr/share/lmod/lmod/init/sh + /usr/share/lmod/lmod/init/csh + /usr/share/lmod/lmod/init/perl + /usr/share/lmod/lmod/init/env_modules_python.py + /usr/share/lmod/lmod/libexec/lmod perl + module + module + /usr/share/lmod/lmod/libexec/lmod python + + + + PrgEnv-cray PrgEnv-cray/8.3.3 + cce cce/14.0.0 + + + + + PrgEnv-cray PrgEnv-amd/8.3.3 + amd amd/5.1.0 + + + + + PrgEnv-cray PrgEnv-gnu/8.3.3 + + + cray-mpich/8.1.12 + cray-python/3.9.4.2 + subversion/1.14.0 + git/2.31.1 + cmake/3.21.3 + zlib/1.2.11 + cray-libsci/21.08.1.2 + cray-hdf5-parallel/1.12.1.1 + cray-netcdf-hdf5parallel/4.8.1.1 + cray-parallel-netcdf/1.12.1.7 + + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 $ENV{NETCDF_DIR} $ENV{PNETCDF_DIR} + $ENV{CRAY_LIBSCI_DIR}/amd/4.0/x86_64/lib:$ENV{LD_LIBRARY_PATH} + + + 0 + + + + 128M + spread + threads + + + + + + + + + Crusher. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL. 192 AMD EPYC 7A53 64C nodes, 128 hwthreads, 512GB DDR4, 4 MI250X GPUs + .*crusher.* + CNL + crayclang-scream + mpich + CLI133_crusher + /gpfs/alpine/cli133/proj-shared/$ENV{USER}/e3sm_scratch/crusher + /gpfs/alpine/cli115/world-shared/e3sm/inputdata + /gpfs/alpine/cli115/world-shared/e3sm/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /gpfs/alpine/cli133/world-shared/e3sm/tools/cprnc/cprnc + 8 + 1 + slurm + e3sm + 64 + 8 + TRUE + + + srun + + -l -K -n {{ total_tasks }} -N {{ num_nodes }} + + + --gpus-per-node=8 --gpu-bind=closest + -c $ENV{OMP_NUM_THREADS} + + + + + + + /usr/share/lmod/lmod/init/sh + /usr/share/lmod/lmod/init/csh + /usr/share/lmod/lmod/init/perl + /usr/share/lmod/lmod/init/env_modules_python.py + /usr/share/lmod/lmod/libexec/lmod perl + module + module + /usr/share/lmod/lmod/libexec/lmod python + + + PrgEnv-cray + + + rocm/5.1.0 + + + cray-python/3.9.4.2 + subversion/1.14.0 + git/2.31.1 + cmake/3.21.3 + zlib/1.2.11 + cray-hdf5-parallel/1.12.1.1 + cray-netcdf-hdf5parallel/4.8.1.1 + cray-parallel-netcdf/1.12.1.7 + + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + 0 + + $ENV{NETCDF_DIR} + $ENV{PNETCDF_DIR} + 0 + + 0 + romio_cb_read=disable + + 128M spread @@ -645,6 +857,11 @@ + + + + + Cori. XC40 Cray system at NERSC. Haswell partition. os is CNL, 32 pes/node, batch system is SLURM cori-knl-is-default @@ -1095,6 +1312,36 @@ + + Windows Subsystem for Linux v2, using Ubuntu distribution + none + LINUX + gnu + mpich + $ENV{HOME}/e3sm_scratch + $ENV{HOME}/pt-e3sm-inputdata + $ENV{HOME}/pt-e3sm-inputdata + $ENV{HOME}/e3sm_scratch/archive/$CASE + $ENV{HOME}/e3sm_baselines + $CCSMROOT/tools/cprnc/build/cprnc + make + 4 + e3sm_developer + none + thorntonpe at ornl dot gov + 4 + 4 + + mpirun + + -np {{ total_tasks }} + + + + $ENV{HOME}/e3sm_scratch/$CASE/run + $ENV{HOME}/e3sm_scratch/$CASE/bld + + Singularity container singularity @@ -1219,7 +1466,7 @@ mappy LINUX proxy.sandia.gov:80 - gnu,intel + gnu,gnu9,intel openmpi /sems-data-store/ACME/mappy/timings .* @@ -1231,15 +1478,14 @@ /sems-data-store/ACME/mappy/cprnc/cprnc 64 e3sm_developer - none + slurm_single_node jgfouca at sandia dot gov 64 64 - mpirun + srun - -np {{ total_tasks }} - --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to hwthread:overload-allowed + --cpu_bind=threads @@ -1261,15 +1507,21 @@ acme-gcc/8.1.0 + + sems-archive-gcc/9.2.0 + sems-archive-intel/19.0.5 - + acme-netcdf/4.4.1/exo_acme acme-pfunit/3.2.8/base + + sems-archive-netcdf/4.7.3/base + - acme-openmpi/2.1.5 + acme-openmpi/4.1.4 acme-netcdf/4.7.4/acme @@ -1286,6 +1538,60 @@ + + Sandia GPU testbed + weaver + LINUX + gnugpu + openmpi + /home/projects/e3sm/timings + .* + $ENV{HOME}/acme/scratch + /home/projects/e3sm/scream/data + /home/projects/e3sm/scream/data/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + /home/projects/e3sm/baselines/weaver/$COMPILER + /home/projects/e3sm/cprnc/cprnc + 32 + e3sm_developer + lsf + jgfouca at sandia dot gov + 32 + 32 + + mpirun + + -np {{ total_tasks }} + + + + /usr/share/Modules/init/sh + /usr/share/Modules/init/python.py + module + /usr/bin/modulecmd python + + + cuda/10.1.105 + ucx/1.6.0 + git/2.10.1 + python/3.7.3 + cmake/3.18.0 + perl/5.22.1 + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + 0 + + + /ascldap/users/projects/e3sm/scream/libs/mpfr/install/weaver/lib:/ascldap/users/projects/e3sm/scream/libs/gcc/install/weaver/gcc/8.5.0/lib64:/ascldap/users/projects/e3sm/scream/libs/gcc/install/weaver/gcc/8.5.0/lib:$ENV{LD_LIBRARY_PATH} + /ascldap/users/projects/e3sm/scream/libs/gcc/install/weaver/gcc/8.5.0/bin:/ascldap/users/projects/e3sm/scream/libs/gcc/install/weaver/gcc/8.5.0/libexec/gcc/powerpc64le-unknown-linux-gnu/8.5.0:/ascldap/users/projects/e3sm/scream/libs/openmpi/install/weaver/gcc/8.5.0/cuda/10.1.105/bin:/ascldap/users/projects/e3sm/scream/libs/pnetcdf/install/weaver/gcc/8.5.0/cuda/10.1.105/bin:/ascldap/users/projects/e3sm/scream/libs/netcdf-c/install/weaver/gcc/8.5.0/cuda/10.1.105/bin:/ascldap/users/projects/e3sm/scream/libs/netcdf-fortran/install/weaver/gcc/8.5.0/cuda/10.1.105/bin:/ascldap/users/projects/e3sm/scream/libs/wget/bin:/ascldap/users/jgfouca/perl5/bin:$ENV{PATH} + /ascldap/users/jgfouca/perl5/lib/perl5 + /ascldap/users/jgfouca/perl5 + + + IBM Power 8 Testbed machine white @@ -1770,7 +2076,7 @@ ANL/LCRC Linux Cluster - b.*.lcrc.anl.gov + (b\d+|blues.*).lcrc.anl.gov LINUX intel,gnu impi,openmpi,mvapich @@ -1927,7 +2233,7 @@ srun --mpi=pmi2 -l -n {{ total_tasks }} -N {{ num_nodes }} --kill-on-bad-exit - $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} + $SHELL{if [ 64 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} -c $SHELL{echo 128/ {{ tasks_per_node }} |bc} -m plane={{ tasks_per_node }} @@ -2277,16 +2583,17 @@ LINUX intel mpich - /p/lustre2/$USER + cbronze + /p/lustre2/$USER/e3sm_scratch/syrah /usr/gdata/climdat/ccsm3data/inputdata /usr/gdata/climdat/ccsm3data/inputdata/atm/datm7 - /p/lustre2/$CCSMUSER/archive/$CASE - /p/lustre2/$CCSMUSER/ccsm_baselines/$COMPILER - /p/lscratchd/ma21/ccsm3data/tools/cprnc/cprnc + /p/lustre2/$USER/archive/$CASE + /p/lustre2/$USER/ccsm_baselines/$COMPILER + /usr/gdata/climdat/tools/cprnc 8 lc_slurm donahue5 -at- llnl.gov - 16 + 32 16 @@ -2313,8 +2620,8 @@ pnetcdf/1.9.0 - /p/lustre2/$CCSMUSER/ACME/$CASE/run - /p/lustre2/$CCSMUSER/$CASE/bld + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld /usr/tce/packages/netcdf-fortran/netcdf-fortran-4.4.4-intel-19.0.4/ /usr/tce/packages/netcdf-fortran/netcdf-fortran-4.4.4-intel-19.0.4/ @@ -2329,16 +2636,17 @@ LINUX intel mpich - /p/lustre1/$USER + cbronze + /p/lustre2/$USER/e3sm_scratch/quartz /usr/gdata/climdat/ccsm3data/inputdata /usr/gdata/climdat/ccsm3data/inputdata/atm/datm7 - /p/lustre1/$CCSMUSER/archive/$CASE - /p/lustre1/$CCSMUSER/ccsm_baselines/$COMPILER - /p/lscratchd/ma21/ccsm3data/tools/cprnc/cprnc + /p/lustre2/$USER/archive/$CASE + /p/lustre2/$USER/ccsm_baselines/$COMPILER + /usr/gdata/climdat/tools/cprnc 8 lc_slurm donahue5 -at- llnl.gov - 36 + 72 36 @@ -2365,8 +2673,8 @@ pnetcdf/1.9.0 - /p/lustre1/$USER/$CASE/run - /p/lustre1/$USER/$CASE/bld + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld /usr/tce/packages/netcdf-fortran/netcdf-fortran-4.4.4-intel-19.0.4/ /usr/tce/packages/netcdf-fortran/netcdf-fortran-4.4.4-intel-19.0.4/ @@ -3571,10 +3879,10 @@ /usr/Modules/bin/modulecmd python - cmake/3.15.0 + cmake/3.15.0 perl - xml-libxml/2.0116 - python/3.6 + xml-libxml/2.0116 + python/3.6 intel/2016.4.072 @@ -3649,10 +3957,10 @@ /usr/Modules/bin/modulecmd perl /usr/Modules/bin/modulecmd python - + cmake/3.15.0 perl - xml-libxml/2.0116 + xml-libxml/2.0116 python/3.6 @@ -3831,21 +4139,21 @@ nvhpc/21.11 + + xl/16.1.1-10 + + + gcc/9.1.0 + cuda/10.1.243 cuda/11.0.3 - - xl/16.1.1-10 - cuda/10.1.243 - - gcc/9.1.0 - spectrum-mpi/10.4.0.3-20210112 hdf5/1.10.7 @@ -3890,6 +4198,10 @@ cpu-cpu $SHELL{echo "2*((`./xmlquery --value TOTAL_TASKS` + `./xmlquery --value TASKS_PER_NODE` - 1)/`./xmlquery --value TASKS_PER_NODE`)"|bc} $SHELL{echo "(`./xmlquery --value MAX_TASKS_PER_NODE`+41)/42"|bc} + 1 + 1 + mlx5_3:1,mlx5_0:1 + mlx5_0:1,mlx5_3:1 2 @@ -3919,6 +4231,10 @@ 1 gpu-cpu $SHELL{echo "6*((`./xmlquery --value TOTAL_TASKS` + `./xmlquery --value TASKS_PER_NODE` - 1)/`./xmlquery --value TASKS_PER_NODE`)"|bc} + 1 + 1 + mlx5_3:1,mlx5_0:1 + mlx5_0:1,mlx5_3:1 @@ -4762,4 +5078,4 @@ >> e3sm.log.$LID 2>&1 - \ No newline at end of file +