Merge branch 'main' into osu_followup

EESSI · Feb 16, 2024 · 1c65a3c · 1c65a3c
2 parents 018ce56 + d516f05
commit 1c65a3c
Show file tree

Hide file tree

Showing 12 changed files with 93 additions and 26 deletions.
diff --git a/config/aws_mc.py b/config/aws_mc.py
@@ -13,7 +13,7 @@
 import os
 
 from eessi.testsuite.common_config import common_logging_config, common_eessi_init
-from eessi.testsuite.constants import FEATURES
+from eessi.testsuite.constants import FEATURES, SCALES
 
 # This config will write all staging, output and logging to subdirs under this prefix
 # Override with RFM_PREFIX environment variable
@@ -97,7 +97,7 @@
     'environs': ['default'],
     'features': [
         FEATURES['CPU']
-    ],
+    ] + list(SCALES.keys()),
     'prepare_cmds': [
         'source %s' % common_eessi_init(),
         # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job

diff --git a/config/github_actions.py b/config/github_actions.py
@@ -17,7 +17,7 @@
                     'scheduler': 'local',
                     'launcher': 'local',
                     'environs': ['default'],
-                    'features': [FEATURES[CPU]],
+                    'features': [FEATURES[CPU]] + list(SCALES.keys()),
                     'processor': {'num_cpus': 2},
                     'resources': [
                         {

diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py
@@ -52,7 +52,7 @@
                     'max_jobs': 120,
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
                 },
 # We don't have GPU budget on Karolina at this time
@@ -88,7 +88,7 @@
 #                     ],
 #                     'features': [
 #                         FEATURES[GPU],
-#                     ],
+#                     ] + list(SCALES.keys()),
 #                     'descr': 'GPU partition with accelerated nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
 #                 },
             ]

diff --git a/config/izum_vega.py b/config/izum_vega.py
@@ -58,7 +58,7 @@
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
                 },
                 {
@@ -97,7 +97,7 @@
                     ],
                     'features': [
                         FEATURES[GPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
                 },
             ]

diff --git a/config/settings_example.py b/config/settings_example.py
@@ -56,7 +56,9 @@
                             'options': ['--mem={size}'],
                         }
                     ],
-                    'features': [FEATURES[CPU]],
+                    # list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for this partition
+                    # Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config
+                    'features': [FEATURES[CPU]] + list(SCALES.keys()),
                 },
                 {
                     'name': 'gpu_partition',
@@ -94,7 +96,7 @@
                     'features': [
                         FEATURES[CPU],
                         FEATURES[GPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
                     },

diff --git a/config/surf_snellius.py b/config/surf_snellius.py
@@ -22,6 +22,9 @@
 # Override with RFM_PREFIX environment variable
 reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs')
 
+# Jobs that partially fill multiple nodes are not allowed on the GPU partition
+valid_scales_snellius_gpu = [s for s in SCALES if s not in ['1_cpn_2_nodes', '1_cpn_4_nodes']]
+
 # This is an example configuration file
 site_configuration = {
     'systems': [
@@ -49,7 +52,7 @@
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'descr': 'AMD Rome CPU partition with native EESSI stack'
                 },
                 {
@@ -68,7 +71,7 @@
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'descr': 'AMD Genoa CPU partition with native EESSI stack'
                 },
 
@@ -98,7 +101,7 @@
                     ],
                     'features': [
                         FEATURES[GPU],
-                    ],
+                    ] + valid_scales_snellius_gpu,
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
                     },

diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py
@@ -53,7 +53,7 @@ def command(self, job):
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                 },
                 {
                     'name': 'cpu_rome_512gb',
@@ -80,7 +80,7 @@ def command(self, job):
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                 },
                 {
                     'name': 'cpu_milan',
@@ -107,7 +107,7 @@ def command(self, job):
                     ],
                     'features': [
                         FEATURES[CPU],
-                    ],
+                    ] + list(SCALES.keys()),
                 },
                 {
                     'name': 'gpu_rome_a100_40gb',
@@ -128,7 +128,7 @@ def command(self, job):
                     },
                     'features': [
                         FEATURES[GPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
                     },
@@ -169,7 +169,7 @@ def command(self, job):
                     },
                     'features': [
                         FEATURES[GPU],
-                    ],
+                    ] + list(SCALES.keys()),
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
                     },

diff --git a/eessi/testsuite/constants.py b/eessi/testsuite/constants.py
@@ -60,3 +60,6 @@
     '8_nodes': {'num_nodes': 8, 'node_part': 1},
     '16_nodes': {'num_nodes': 16, 'node_part': 1},
 }
+
+# When tests are filtered by the hooks, the valid_systems is set to this system name:
+INVALID_SYSTEM = "INVALID_SYSTEM"
diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py
@@ -3,6 +3,7 @@
 """
 import math
 import shlex
+import warnings
 
 import reframe as rfm
 
@@ -283,15 +284,64 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
     log(f'num_tasks set to {test.num_tasks}')
 
 
+def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
+    """
+    Sets test.valid_systems based on the valid_systems argument.
+    - If valid_systems is an empty string, test.valid_systems is set equal to eessi.testsuite.constants.INVALID_SYSTEM
+    - If test.valid_systems was an empty list, leave it as is (test should not be run)
+    - If test.valid_systems was at the default value ['*'], it is overwritten by [valid_systems]
+    - If test.valid_systems was already set and is a list of one element, valid_systems is appended to that
+    element (space-separated), which allows adding requests for multiple partition features by different hooks.
+    - If test.valid_systems was already set and is a list of multiple elements, we warn that the user has to take
+    care of filtering themselves. This is typically the case when someone overrides the valid_systems on command line.
+    In this scenario, this function leaves test.valid_systems as it is.
+    """
+
+    # This indicates an invalid test that always has to be filtered
+    if valid_systems == '':
+        test.valid_systems = [INVALID_SYSTEM]
+        return
+
+    # test.valid_systems is an empty list (no valid systems), so leave it unchanged
+    if len(test.valid_systems) == 0:
+        # test.valid_systems is empty, meaning all tests are filtered out. This hook shouldn't change that
+        return
+    # test.valid_systems still at default value, so overwrite
+    elif len(test.valid_systems) == 1 and test.valid_systems[0] == '*':
+        test.valid_systems = [valid_systems]
+    # test.valid_systems was set before, so append
+    elif len(test.valid_systems) == 1:
+        test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
+    else:
+        warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
+        warn_msg += f" which is not supported by this hook."
+        warn_msg += f" Make sure to handle filtering yourself."
+        warnings.warn(warn_msg)
+        return
+
+
+def filter_supported_scales(test: rfm.RegressionTest):
+    """
+    Filter test scales based on which scales are supported by each partition in the ReFrame configuration.
+    Filtering is done using features, i.e. the current test scale is requested as a feature.
+    Any partition that does not include this feature in the ReFrame configuration file will effectively be filtered out.
+    """
+    valid_systems = f'+{test.scale}'
+
+    # Change test.valid_systems accordingly:
+    _set_or_append_valid_systems(test, valid_systems)
+
+    log(f'valid_systems set to {test.valid_systems}')
+
 def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_device_type: str):
     """
     Filter valid_systems by required device type and by whether the module supports CUDA,
     unless valid_systems is specified with --setvar valid_systems=<comma-separated-list>.
-    """
-    if test.valid_systems:
-        # valid_systems is specified, so don't filter
-        return
 
+    Any invalid combination (e.g. a non-CUDA module with a required_device_type GPU) will
+    cause the valid_systems to be set to an empty string, and consequently the
+    test.valid_systems to an invalid system name (eessi.testsuite.constants.INVALID_SYSTEM).
+    """
     is_cuda_module = is_cuda_required_module(test.module_name)
 
     if is_cuda_module and required_device_type == DEVICE_TYPES[GPU]:
@@ -312,8 +362,8 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic
         # Invalid combination: a module without GPU support cannot use a GPU
         valid_systems = ''
 
-    if valid_systems:
-        test.valid_systems = [valid_systems]
+    # Change test.valid_systems accordingly:
+    _set_or_append_valid_systems(test, valid_systems)
 
     log(f'valid_systems set to {test.valid_systems}')
 

diff --git a/eessi/testsuite/tests/apps/gromacs.py b/eessi/testsuite/tests/apps/gromacs.py
@@ -42,14 +42,17 @@
 class EESSI_GROMACS(gromacs_check):
     scale = parameter(SCALES.keys())
     valid_prog_environs = ['default']
-    valid_systems = []
+    valid_systems = ['*']
     time_limit = '30m'
     module_name = parameter(find_modules('GROMACS'))
 
     @run_after('init')
     def run_after_init(self):
         """Hooks to run after the init phase"""
 
+        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
+        hooks.filter_supported_scales(self)
+
         # Make sure that GPU tests run in partitions that support running on a GPU,
         # and that CPU-only tests run in partitions that support running CPU-only.
         # Also support setting valid_systems on the cmd line.

diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
@@ -42,7 +42,7 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
     ''' Run-only OSU test '''
     scale = parameter(filter_scales_pt2pt())
     valid_prog_environs = ['default']
-    valid_systems = []
+    valid_systems = ['*']
     time_limit = '30m'
     module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
     # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default
@@ -57,6 +57,9 @@ def run_after_init(self):
         # Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher
         # commands (before setup) if not equal to 'cpu'. We set it to 'cpu' initially and change it later in this hook depending on the test.
         self.device_buffers = 'cpu'
+        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
+        hooks.filter_supported_scales(self)
+
         hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
         is_cuda_module = utils.is_cuda_required_module(self.module_name)
         # This part of the hook is meant to be for the OSU cpu tests. This is required since the non CUDA module should

diff --git a/eessi/testsuite/tests/apps/tensorflow/tensorflow.py b/eessi/testsuite/tests/apps/tensorflow/tensorflow.py
@@ -16,7 +16,7 @@ class EESSI_TensorFlow(rfm.RunOnlyRegressionTest):
     # This test can run at any scale, so parameterize over all known SCALES
     scale = parameter(SCALES.keys())
     valid_prog_environs = ['default']
-    valid_systems = []
+    valid_systems = ['*']
 
     # Parameterize over all modules that start with TensorFlow
     module_name = parameter(utils.find_modules('TensorFlow'))
@@ -70,6 +70,9 @@ def perf(self):
     @run_after('init')
     def run_after_init(self):
         """hooks to run after the init phase"""
+        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
+        hooks.filter_supported_scales(self)
+
         hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
         hooks.set_modules(self)
         hooks.set_tag_scale(self)