-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PyTorch test that uses torchvision #130
Changes from 20 commits
2e7bf95
e089760
a675fc9
a6e53bc
8bb9bbd
a6bf34d
fce2a45
b868cc1
760cd59
fed8906
357f649
887f7b3
6b1e36a
2d33141
4cb7b36
2f0bea2
fc067b2
4ddfe23
73b7e84
07b2c1b
11146ef
8298e6a
af30b64
7ddeedb
d62443b
00fca31
a69e2d3
4c5c3e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,147 @@ | ||||||
from itertools import chain | ||||||
|
||||||
import reframe as rfm | ||||||
import reframe.utility.sanity as sn | ||||||
# Added only to make the linter happy | ||||||
from reframe.core.builtins import parameter, variable, run_after, sanity_function, performance_function | ||||||
|
||||||
from eessi.testsuite import hooks | ||||||
from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU, INVALID_SYSTEM | ||||||
from eessi.testsuite.utils import find_modules | ||||||
|
||||||
|
||||||
class EESSI_PyTorch_torchvision(rfm.RunOnlyRegressionTest):
    """Benchmark a torchvision neural-network model on synthetic data with PyTorch.

    Runs ``pytorch_synthetic_benchmark.py`` for a selection of torchvision models,
    on CPU or GPU (set by the concrete subclasses via ``compute_device``), either
    single-process or with PyTorch DistributedDataParallel (DDP).
    """

    # Torchvision models to benchmark
    nn_model = parameter(['vgg16', 'resnet50', 'resnet152', 'densenet121', 'mobilenet_v3_large'])
    # All scales known to the EESSI test suite (filtered per partition in apply_init_hooks)
    scale = parameter(SCALES.keys())
    # None = single process; 'ddp' = PyTorch DistributedDataParallel
    parallel_strategy = parameter([None, 'ddp'])
    # Set by the concrete CPU/GPU subclasses
    compute_device = variable(str)
    # Both torchvision and PyTorch-bundle modules have everything needed to run this test
    module_name = parameter(chain(find_modules('torchvision'), find_modules('PyTorch-bundle')))

    descr = 'Benchmark that runs a selected torchvision model on synthetic data'

    executable = 'python'

    valid_prog_environs = ['default']
    valid_systems = ['*']

    time_limit = '30m'

    @run_after('init')
    def prepare_test(self):
        """Build the benchmark command line from the test parameters."""
        # Set nn_model as executable option
        self.executable_opts = ['pytorch_synthetic_benchmark.py --model %s' % self.nn_model]

        # If not a GPU run, disable CUDA
        if self.compute_device != DEVICE_TYPES[GPU]:
            self.executable_opts += ['--no-cuda']

    @run_after('init')
    def apply_init_hooks(self):
        """Apply the standard EESSI test-suite hooks that run at init time."""
        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
        hooks.filter_supported_scales(self)

        # Make sure that GPU tests run in partitions that support running on a GPU,
        # and that CPU-only tests run in partitions that support running CPU-only.
        # Also support setting valid_systems on the cmd line.
        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device)

        # Support selecting modules on the cmd line.
        hooks.set_modules(self)

        # Support selecting scales on the cmd line via tags.
        hooks.set_tag_scale(self)

    @run_after('init')
    def set_tag_ci(self):
        """Tag the resnet50 variant so it is picked up by the CI test selection."""
        if self.nn_model == 'resnet50':
            self.tags.add(TAGS['CI'])

    @run_after('setup')
    def apply_setup_hooks(self):
        """Assign tasks per compute unit and set process binding."""
        if self.compute_device == DEVICE_TYPES[GPU]:
            # Launch one rank per GPU
            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
        else:
            # Hybrid code: launch one rank per NUMA node, threads fill the rest of the node
            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])

        # This is a hybrid test, binding is important for performance
        hooks.set_compact_process_binding(self)

    @run_after('setup')
    def set_ddp_env_vars(self):
        """Pass the options PyTorch DDP needs: master address/port and world size."""
        # Set environment variables for PyTorch DDP
        if self.parallel_strategy == 'ddp':
            # Set additional options required by DDP; a free port is determined
            # at job run time by the helper script (shell command substitution)
            self.executable_opts += ["--master-port $(python python_get_free_socket.py)"]
            self.executable_opts += ["--master-address $(hostname --fqdn)"]
            self.executable_opts += ["--world-size %s" % self.num_tasks]

    @run_after('setup')
    def filter_invalid_parameter_combinations(self):
        """Skip parameter combinations that make no sense.

        We cannot detect this situation before the setup phase, because it requires
        self.num_tasks. Thus, the core count of the node needs to be known, which is
        only the case after the setup phase.
        """
        msg = "Skipping test: parallel strategy is 'None',"
        msg += f" but requested process count is larger than one ({self.num_tasks})."
        self.skip_if(self.num_tasks > 1 and self.parallel_strategy is None, msg)
        msg = f"Skipping test: parallel strategy is {self.parallel_strategy},"
        msg += " but only one process is requested."
        self.skip_if(self.num_tasks == 1 and self.parallel_strategy is not None, msg)

    @run_after('setup')
    def pass_parallel_strategy(self):
        """Set parallelization strategy when using more than one process."""
        if self.num_tasks != 1:
            self.executable_opts += ['--use-%s' % self.parallel_strategy]

    @run_after('setup')
    def avoid_horovod_cpu_contention(self):
        """Reduce OMP thread count to work around Horovod CPU contention."""
        # Horovod had issues with CPU performance, see https://github.com/horovod/horovod/issues/2804
        # The root cause is Horovod having two threads with very high utilization, which interferes with
        # the compute threads. It was fixed, but seems to be broken again in Horovod 0.28.1
        # The easiest workaround is to reduce the number of compute threads by 2
        # NOTE(review): 'horovod' is not in the parallel_strategy parameter list above,
        # so this branch is currently dead code; kept for when horovod support returns.
        if self.compute_device == DEVICE_TYPES[CPU] and self.parallel_strategy == 'horovod':
            self.env_vars['OMP_NUM_THREADS'] = max(self.num_cpus_per_task - 2, 2)  # Never go below 2 compute threads

    @sanity_function
    def assert_num_ranks(self):
        '''Assert that the number of reported CPUs/GPUs used is correct'''
        return sn.assert_found(r'Total img/sec on %s .PU\(s\):.*' % self.num_tasks, self.stdout)

    @performance_function('img/sec')
    def total_throughput(self):
        '''Total training throughput, aggregated over all CPUs/GPUs'''
        return sn.extractsingle(r'Total img/sec on [0-9]+ .PU\(s\):\s+(?P<perf>\S+)', self.stdout, 'perf', float)

    @performance_function('img/sec')
    def througput_per_CPU(self):
        # NOTE(review): the name misspells 'throughput', but is kept as-is because the
        # method name determines the reported performance-variable name (interface).
        '''Training throughput per device (per CPU or per GPU, depending on compute_device)'''
        if self.compute_device == DEVICE_TYPES[CPU]:
            return sn.extractsingle(r'Img/sec per CPU:\s+(?P<perf_per_cpu>\S+)', self.stdout, 'perf_per_cpu', float)
        else:
            return sn.extractsingle(r'Img/sec per GPU:\s+(?P<perf_per_gpu>\S+)', self.stdout, 'perf_per_gpu', float)
|
||||||
|
||||||
@rfm.simple_test
class EESSI_PyTorch_torchvision_CPU(EESSI_PyTorch_torchvision):
    """CPU-only variant of the torchvision benchmark."""
    compute_device = DEVICE_TYPES[CPU]
|
||||||
|
||||||
@rfm.simple_test
class EESSI_PyTorch_torchvision_GPU(EESSI_PyTorch_torchvision):
    """GPU variant of the torchvision benchmark, adding a precision parameter."""
    compute_device = DEVICE_TYPES[GPU]
    # 'default' = full precision, 'mixed' = automatic mixed precision (AMP)
    precision = parameter(['default', 'mixed'])

    @run_after('init')
    def prepare_gpu_test(self):
        """Request automatic mixed precision when the 'mixed' variant is selected."""
        if self.precision == 'mixed':
            self.executable_opts += ['--use-amp']

    @run_after('init')
    def skip_hvd_plus_amp(self):
        '''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
        if self.parallel_strategy == 'horovod' and self.precision == 'mixed':
            # Mark the combination as invalid so it never gets scheduled
            self.valid_systems = [INVALID_SYSTEM]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Based on https://unix.stackexchange.com/a/132524
# Print a free TCP port number: binding to port 0 makes the OS pick an
# available ephemeral port, which we then report on stdout.
import socket

# Use the socket as a context manager so it is always closed, even if
# bind() or getsockname() raises (the original closed it manually).
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(('', 0))
    addr = s.getsockname()
    print(addr[1])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please also update line 83 to: