Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new genoa partition to Snellius configuration + clarify comments on CPU auto-detection #84

Merged
merged 7 commits into from
Sep 19, 2023
27 changes: 11 additions & 16 deletions config/aws_citc.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# This is an example configuration file
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository

# Note that CPU autodetect currently does not work with this configuration file on AWS.
# This is because there is no system mpirun, and the CPU autodetection doesn't load any modules
# that would make an mpirun command available (as normal multiprocessing tests would).
# In order to do CPU autodetection, you'll need to change the launcher to srun:
# 'launcher = srun'
# You can run the CPU autodetect by listing all tests (reframe -l ...)
# and then, once all CPUs are autodetected, change the launcher back to mpirun for a 'real' run (reframe -r ...)
# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job

# Another known issue is that CPU autodetection fails if run from an actual installation of ReFrame.
# It only works if run from a clone of the ReFrame GitHub repository. See https://github.com/reframe-hpc/reframe/issues/2914
# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

Expand Down Expand Up @@ -119,6 +118,8 @@
'logging': common_logging_config(reframe_prefix),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
Expand All @@ -127,12 +128,6 @@
# Add default things to each partition:
partition_defaults = {
'scheduler': 'squeue',
# mpirun causes problems with cpu autodetect, since there is no system mpirun.
# See https://github.com/EESSI/test-suite/pull/53#issuecomment-1590849226
# and this feature request https://github.com/reframe-hpc/reframe/issues/2926
# However, using srun requires either using pmix or proper pmi2 integration in the MPI library
# See https://github.com/EESSI/test-suite/pull/53#issuecomment-1598753968
# Thus, we use mpirun for now, and manually swap to srun if we want to autodetect CPUs...
'launcher': 'mpirun',
'environs': ['default'],
'features': [
Expand Down
33 changes: 24 additions & 9 deletions config/izum_vega.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# 3. Temporarily change the 'access' field for the GPU partition to
# 'access': ['-p gpu', '--export=None', '--gres=gpu:1'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job
# 3. Vega doesn't allow submission to the GPU partition without requesting at least one GPU (change #3)

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

from eessi.testsuite.common_config import common_logging_config
Expand All @@ -9,13 +24,6 @@

# This is an example configuration file
site_configuration = {
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
'systems': [
{
'name': 'vega',
Expand All @@ -37,7 +45,7 @@
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection
'launcher': 'mpirun',
# Use --export=None to prevent the login environment from being passed down to submitted jobs
'access': ['-p cpu', '--export=None'],
'environs': ['default'],
Expand All @@ -60,7 +68,7 @@
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection
'launcher': 'mpirun',
# Use --export=None to prevent the login environment from being passed down to submitted jobs
'access': ['-p gpu', '--export=None'],
'environs': ['default'],
Expand Down Expand Up @@ -94,4 +102,11 @@
},
],
'logging': common_logging_config(reframe_prefix),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
}
23 changes: 23 additions & 0 deletions config/settings_example.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# If your system has a GPU partition, it might force jobs to request at least one GPU. If that is the
# case, you also need to temporarily change the 'access' field for the GPU partition to include the request
# for one GPU, e.g. 'access': ['-p gpu', '--export=None', '--gres=gpu:1'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914


"""
Example configuration file
"""
Expand Down Expand Up @@ -79,6 +95,13 @@
},
],
'logging': common_logging_config(),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
}

# optional logging to syslog
Expand Down
42 changes: 35 additions & 7 deletions config/surf_snellius.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# WARNING: for CPU autodetect to work correctly you need to
# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun
# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository
# 3. Temporarily change the 'access' field for the GPU partition to
# 'access': ['-p gpu', '--export=None', '--exclusive'],

# Without this, the autodetect job fails because
# 1. A missing mpirun command
# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job
# 3. Snellius doesn't allow submission to the GPU partition without requesting at least one GPU

# Related issues
# 1. https://github.com/reframe-hpc/reframe/issues/2926
# 2. https://github.com/reframe-hpc/reframe/issues/2914

import os

from eessi.testsuite.common_config import common_logging_config
Expand All @@ -19,18 +34,32 @@
'stagedir': f'/scratch-shared/{os.environ.get("USER")}/reframe_output/staging',
'partitions': [
{
'name': 'thin',
'name': 'rome',
'scheduler': 'slurm',
'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'],
'launcher': 'mpirun',
'access': ['-p rome', '--export=None'],
'environs': ['default'],
'max_jobs': 120,
'features': [
FEATURES[CPU],
],
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
{
'name': 'genoa',
'scheduler': 'slurm',
'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'],
'launcher': 'mpirun',
'access': ['-p thin', '--export=None'],
'access': ['-p genoa', '--export=None'],
'environs': ['default'],
'max_jobs': 120,
'features': [
FEATURES[CPU],
],
'descr': 'Test CPU partition with native EESSI stack'
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},

{
'name': 'gpu',
'scheduler': 'slurm',
Expand All @@ -57,7 +86,7 @@
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
'descr': 'Test GPU partition with native EESSI stack'
'descr': 'Nvidia A100 GPU partition with native EESSI stack'
},
]
},
Expand All @@ -73,9 +102,8 @@
'logging': common_logging_config(reframe_prefix),
'general': [
{
# For autodetect to work, temporarily change:
# 1. The launchers to srun
# 2. Add --exclusive to GPU 'access' field above (avoids submission error that no GPUs are requested)
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
}
],
Expand Down
Loading