support always requesting GPUs on partitions that require it #116

Merged
14 commits · Feb 29, 2024
2 changes: 2 additions & 0 deletions eessi/testsuite/constants.py
@@ -11,6 +11,7 @@
INTEL = 'INTEL'
NODE = 'NODE'
NVIDIA = 'NVIDIA'
ALWAYS_REQUEST_GPUS = 'ALWAYS_REQUEST_GPUS'

DEVICE_TYPES = {
CPU: 'cpu',
@@ -31,6 +32,7 @@
FEATURES = {
CPU: 'cpu',
GPU: 'gpu',
ALWAYS_REQUEST_GPUS: 'always_request_gpus',
}

GPU_VENDORS = {
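For reference, a partition opts in to these features through its features list in the ReFrame site configuration. Below is a minimal sketch of such a partition entry; the system/partition names, scheduler, and device count are hypothetical, and only the features values come from the constants above.

# Hypothetical ReFrame site-config snippet: the partition advertises both the
# 'gpu' feature and the new 'always_request_gpus' feature.
site_configuration = {
    'systems': [
        {
            'name': 'example_cluster',    # hypothetical system name
            'hostnames': ['login.*'],
            'partitions': [
                {
                    'name': 'gpu_nodes',  # hypothetical partition name
                    'scheduler': 'slurm',
                    'launcher': 'mpirun',
                    'environs': ['default'],
                    'features': ['gpu', 'always_request_gpus'],
                    'devices': [{'type': 'gpu', 'num_devices': 4}],
                },
            ],
        },
    ],
}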
78 changes: 46 additions & 32 deletions eessi/testsuite/hooks.py
@@ -7,12 +7,12 @@

import reframe as rfm

from eessi.testsuite.constants import * # noqa
from eessi.testsuite.constants import *
from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log,
check_proc_attribute_defined)


def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of cpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
@@ -34,6 +34,27 @@ def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
log(f'default_num_cpus_per_node set to {test.default_num_cpus_per_node}')


def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of gpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
If so, check if it doesn't exceed the maximum available.
If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
"""

test.max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > test.max_avail_gpus_per_node,
f'Number of GPUs per node in selected scale ({test.default_num_gpus_per_node}) is higher than max available'
f' ({test.max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(test.max_avail_gpus_per_node / test.node_part)


def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
"""
Assign one task per compute unit (COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET] or COMPUTE_UNIT[GPU]).
@@ -69,15 +90,18 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):

# Check if either node_part, or default_num_cpus_per_node and default_num_gpus_per_node are set correctly
if not (
type(test.node_part) == int or
(type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
type(test.node_part) == int
or (type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
):
raise ValueError(
f'Either node_part ({test.node_part}), or default_num_cpus_per_node ({test.default_num_cpus_per_node}) and'
f' default num_gpus_per_node ({test.default_num_gpus_per_node}) must be defined and have integer values.'
)

assign_default_num_cpus_per_node(test)
_assign_default_num_cpus_per_node(test)

if FEATURES[GPU] in test.current_partition.features:
_assign_default_num_gpus_per_node(test)

if compute_unit == COMPUTE_UNIT[GPU]:
_assign_one_task_per_gpu(test)
@@ -90,6 +114,8 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
else:
raise ValueError(f'compute unit {compute_unit} is currently not supported')

_check_always_request_gpus(test)


def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
"""
@@ -112,7 +138,6 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
test.num_tasks_per_node = num_per
test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)


# num_tasks_per_node is not set, but num_cpus_per_task is
elif not test.num_tasks_per_node:
test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task)
@@ -222,11 +247,6 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
--setvar num_cpus_per_task=<y> and/or
--setvar num_gpus_per_node=<z>.

Variables:
- max_avail_gpus_per_node: maximum available number of GPUs per node
- default_num_gpus_per_node: default number of GPUs per node as defined in the test
(e.g. by earlier hooks like set_tag_scale)

Default resources requested:
- num_gpus_per_node = default_num_gpus_per_node
- num_tasks_per_node = num_gpus_per_node
@@ -235,22 +255,6 @@
If num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or default_num_gpus_per_node
(whichever is smaller), unless num_gpus_per_node is also set.
"""
max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)

# Check if the default number of gpus per node is already defined in the test
# (e.g. by earlier hooks like set_tag_scale).
# If so, check if it doesn't exceed the maximum available.
# If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > max_avail_gpus_per_node,
f'Requested GPUs per node ({test.default_num_gpus_per_node}) is higher than max available'
f' ({max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(max_avail_gpus_per_node / test.node_part)

# neither num_tasks_per_node nor num_gpus_per_node are set
if not test.num_tasks_per_node and not test.num_gpus_per_node:
@@ -273,7 +277,7 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
# limit num_cpus_per_task to the maximum available cpus per gpu
test.num_cpus_per_task = min(
int(test.default_num_cpus_per_node / test.num_tasks_per_node),
int(test.max_avail_cpus_per_node / max_avail_gpus_per_node)
int(test.max_avail_cpus_per_node / test.max_avail_gpus_per_node)
)

test.num_tasks = test.num_nodes * test.num_tasks_per_node
@@ -303,8 +307,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
return

# test.valid_systems wasn't set yet, so set it
if len(test.valid_systems) == 0:
# test.valid_systems is empty, meaning all tests are filtered out. This hook shouldn't change that
if len(test.valid_systems) == 0 or test.valid_systems == [INVALID_SYSTEM]:
# test.valid_systems is empty or invalid, meaning all tests are filtered out. This hook shouldn't change that
return
# test.valid_systems still at default value, so overwrite
elif len(test.valid_systems) == 1 and test.valid_systems[0] == '*':
@@ -314,8 +318,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
else:
warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
warn_msg += f" which is not supported by this hook."
warn_msg += f" Make sure to handle filtering yourself."
warn_msg += " which is not supported by this hook."
warn_msg += " Make sure to handle filtering yourself."
warnings.warn(warn_msg)
return

@@ -333,6 +337,7 @@ def filter_supported_scales(test: rfm.RegressionTest):

log(f'valid_systems set to {test.valid_systems}')


def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_device_type: str):
"""
Filter valid_systems by required device type and by whether the module supports CUDA,
@@ -459,3 +464,12 @@ def set_compact_thread_binding(test: rfm.RegressionTest):
log(f'Set environment variable OMP_PLACES to {test.env_vars["OMP_PLACES"]}')
log(f'Set environment variable OMP_PROC_BIND to {test.env_vars["OMP_PROC_BIND"]}')
log(f'Set environment variable KMP_AFFINITY to {test.env_vars["KMP_AFFINITY"]}')


def _check_always_request_gpus(test: rfm.RegressionTest):
"""
Make sure we always request enough GPUs if required for the current GPU partition (cluster-specific policy)
"""
if FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features and not test.num_gpus_per_node:
test.num_gpus_per_node = test.default_num_gpus_per_node
log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}')
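Putting the hooks.py changes together: assign_tasks_per_compute_unit first calls _assign_default_num_cpus_per_node, then _assign_default_num_gpus_per_node on partitions with the 'gpu' feature, then does the per-compute-unit assignment, and finally calls _check_always_request_gpus. A minimal sketch of a test driving this flow (the test class and executable are hypothetical; only the hook call mirrors the usage pattern in this test suite):

import reframe as rfm
from reframe.core.builtins import run_after

from eessi.testsuite import hooks
from eessi.testsuite.constants import COMPUTE_UNIT, CPU


@rfm.simple_test
class ExampleHostnameTest(rfm.RunOnlyRegressionTest):  # hypothetical test
    valid_systems = ['*']
    valid_prog_environs = ['*']
    executable = 'hostname'

    @run_after('setup')
    def assign_tasks(self):
        # One task per CPU core. On a partition that carries the
        # 'always_request_gpus' feature, _check_always_request_gpus() will also
        # set num_gpus_per_node = default_num_gpus_per_node at the end of this call.
        hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[CPU])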
104 changes: 55 additions & 49 deletions eessi/testsuite/tests/apps/osu.py
@@ -51,40 +51,50 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
# unset num_tasks_per_node from the hpctestlib.
num_tasks_per_node = None

@run_after('init')
def filter_scales_2gpus(self):
"""Filter out scales with < 2 GPUs if running on GPUs"""
if (
self.device_type == DEVICE_TYPES[GPU]
and SCALES[self.scale]['num_nodes'] == 1
and SCALES[self.scale].get('num_gpus_per_node', 2) < 2
):
self.valid_systems = [INVALID_SYSTEM]
log(f'valid_systems set to {self.valid_systems} for scale {self.scale} and device_type {self.device_type}')

@run_after('init')
def filter_benchmark_pt2pt(self):
""" Filter out all non-mpi.pt2pt benchmarks """
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = [INVALID_SYSTEM]

@run_after('init')
def run_after_init(self):
"""hooks to run after init phase"""
# Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher
# commands (before setup) if not equal to 'cpu'. We set it to 'cpu' initially and change it later in this hook depending on the test.
self.device_buffers = 'cpu'

# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
is_cuda_module = utils.is_cuda_required_module(self.module_name)
# This part of the hook is meant for the OSU CPU tests. It is required because the non-CUDA module should
# be able to run in the GPU partition as well. This is specific to this test and not covered by the function
# above.
if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]:
# Set the device buffer to 'cuda' only if the module is compiled with CUDA.
self.device_buffers = 'cuda'

# If the device_type is CPU then device buffer should always be CPU.
if self.device_type == DEVICE_TYPES[CPU]:
self.device_buffers = 'cpu'

# This part of the code removes the collective communication calls from the run list, since this test is only
# meant for pt2pt.
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = []
hooks.set_modules(self)

@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for Devices if and not for CPU tests. This option is added by hpctestlib to
all pt2pt tests which is not required."""
if(self.device_type == DEVICE_TYPES[CPU]):
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']
# Set scales as tags
hooks.set_tag_scale(self)

@run_after('init')
def set_device_buffers(self):
"""
device_buffers is inherited from the hpctestlib class and adds options to the launcher
commands in a @run_before('setup') hook if not equal to 'cpu'.
Therefore, we must set device_buffers *before* the @run_before('setup') hooks.
"""
if self.device_type == DEVICE_TYPES[GPU]:
# note (inline review comment from the author): checking for device_type is enough here, as the
# is_cuda_module check is already done in hooks.filter_valid_systems_by_device_type
self.device_buffers = 'cuda'

else:
# If the device_type is CPU then device_buffers should always be CPU.
self.device_buffers = 'cpu'

@run_after('init')
def set_tag_ci(self):
@@ -108,44 +118,40 @@ def set_mem(self):
requirement."""
self.extra_resources = {'memory': {'size': '12GB'}}

@run_after('init')
def set_num_tasks(self):
""" Setting scales as tags. """
hooks.set_tag_scale(self)
@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for Devices if and not for CPU tests.
This option is added by hpctestlib in a @run_before('setup') to all pt2pt tests which is not required.
Therefore we must override it *after* the 'setup' phase
"""
if self.device_type == DEVICE_TYPES[CPU]:
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']


@run_after('setup')
def set_num_tasks_per_node(self):
""" Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task
for 1 node and 2 node options where the request is for full nodes."""
if(SCALES.get(self.scale).get('num_nodes') == 1):
if SCALES.get(self.scale).get('num_nodes') == 1:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], 2)
else:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])

@run_after('setup')
def set_num_gpus_per_node(self):
"""
This test does not require GPUs and is for host-to-host communication within GPU nodes. But some systems do
require a GPU allocation to perform any activity on the GPU nodes.
Set number of GPUs per node for GPU-to-GPU tests
"""
if(FEATURES[GPU] in self.current_partition.features and not utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
# Here for the 2_node test we assign max_avail_gpus_per_node, but some systems cannot allocate 1_cpn_2_nodes
# for GPUs and need all GPUs within the 2 nodes allocated for this to work. The test may fail under such
# conditions for the scale 1_cpn_2_nodes because it is simply not allowed.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
elif(FEATURES[GPU] in self.current_partition.features and utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
if(SCALES.get(self.scale).get('num_nodes') == 1):
# Skip the single node test if there is only 1 device in the node.
if(max_avail_gpus_per_node == 1):
self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.")
else:
self.num_gpus_per_node = 2
else:
# Note these settings are for 1_cpn_2_nodes. In that case we want to test for only 1 GPU per node since
# we have not requested for full nodes.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
if self.device_type == DEVICE_TYPES[GPU]:
# Skip single-node tests with less than 2 GPU devices in the node
self.skip_if(
SCALES[self.scale]['num_nodes'] == 1 and self.default_num_gpus_per_node < 2,
"There are < 2 GPU devices present in the node."
f" Skipping tests with device_type={DEVICE_TYPES[GPU]} involving < 2 GPUs and 1 node."
)
if not self.num_gpus_per_node:
self.num_gpus_per_node = self.default_num_gpus_per_node
log(f'num_gpus_per_node set to {self.num_gpus_per_node} for partition {self.current_partition.name}')


@rfm.simple_test
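As a worked example of the default that set_num_gpus_per_node now falls back on: _assign_default_num_gpus_per_node computes math.ceil(max_avail_gpus_per_node / node_part). A quick sketch with hypothetical numbers:

import math

max_avail_gpus_per_node = 4  # hypothetical: a partition with 4 GPUs per node

# node_part == 1 requests a full node, 2 half a node, and so on
for node_part in (1, 2, 4):
    default_num_gpus_per_node = math.ceil(max_avail_gpus_per_node / node_part)
    print(f'node_part={node_part} -> default_num_gpus_per_node={default_num_gpus_per_node}')
# -> 4, 2, 1 GPUs per node, respectively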
6 changes: 6 additions & 0 deletions setup.cfg
@@ -20,3 +20,9 @@ namespace_packages = eessi

[options.packages.find]
include = eessi*

[flake8]
max-line-length = 120
# ignore star imports (F403, F405)
# ignore obsolete warning (W503)
ignore = F403, F405, W503