From e52e4f96e1424fb2e1007da46709f9e29a30b058 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 29 Feb 2024 15:55:14 +0100 Subject: [PATCH 1/5] add linting github workflow --- .github/workflows/linting.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/linting.yml diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 00000000..4aa08cb4 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,28 @@ +name: Static Analysis +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{format('{0}:{1}:{2}', github.repository, github.ref, github.workflow)}} + cancel-in-progress: true + +jobs: + python-linting: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.10 + + - name: install Python packages + run: | + pip install --upgrade pip + pip install --upgrade flake8 + + - name: Run flake8 to verify PEP8-compliance of Python code + run: flake8 From 42d7c3b3a60a203ec7f7630f3752dd7d9e6a3a73 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 29 Feb 2024 16:08:04 +0100 Subject: [PATCH 2/5] fix workflow --- .github/workflows/linting.yml | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4aa08cb4..74ddf364 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,28 +1,25 @@ name: Static Analysis -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{format('{0}:{1}:{2}', github.repository, github.ref, github.workflow)}} - cancel-in-progress: true +on: [push, pull_request, workflow_dispatch] +permissions: read-all jobs: python-linting: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - 
name: Check out repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + persist-credentials: false - - name: set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.10 + - name: Set up Python + uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 + with: + python-version: 3.10 - - name: install Python packages - run: | - pip install --upgrade pip - pip install --upgrade flake8 + - name: Install Python packages + run: | + pip install --upgrade pip + pip install --upgrade flake8 - - name: Run flake8 to verify PEP8-compliance of Python code - run: flake8 + - name: Run flake8 to verify PEP8-compliance of Python code + run: flake8 From bf547bcbee18b7b45a6f608aa6d17cd544684779 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 29 Feb 2024 16:12:00 +0100 Subject: [PATCH 3/5] use quotes for python version in workflow --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 74ddf364..ac1ba57c 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,7 +14,7 @@ jobs: - name: Set up Python uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0 with: - python-version: 3.10 + python-version: '3.10' - name: Install Python packages run: | From 641a7d03b4bc7a46bc391a638eab537bbcb3c3d6 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 29 Feb 2024 17:13:49 +0100 Subject: [PATCH 4/5] fix formatting --- config/aws_mc.py | 4 +- config/github_actions.py | 14 +-- config/it4i_karolina.py | 74 +++++++-------- config/izum_vega.py | 4 +- config/settings_example.py | 18 ++-- config/surf_snellius.py | 6 +- config/vsc_hortense.py | 2 +- eessi/testsuite/common_config.py | 22 ++--- eessi/testsuite/tests/apps/gromacs.py | 1 + eessi/testsuite/tests/apps/osu.py | 40 ++++----- .../tests/apps/tensorflow/src/mnist_setup.py | 52 +++++------ 
.../tests/apps/tensorflow/src/tf_test.py | 90 +++++++++++-------- .../tests/apps/tensorflow/tensorflow.py | 3 +- eessi/testsuite/utils.py | 15 ++-- 14 files changed, 182 insertions(+), 163 deletions(-) diff --git a/config/aws_mc.py b/config/aws_mc.py index 11106e36..7f503bcc 100644 --- a/config/aws_mc.py +++ b/config/aws_mc.py @@ -43,8 +43,8 @@ 'name': 'x86_64-skylake-16c-30gb', 'access': ['--partition=x86-64-intel-skylake-node', '--export=NONE'], 'descr': 'Skylake, 16 cores, 30 GB', - }, - { + }, + { 'name': 'x86_64-zen2-16c-30gb', 'access': ['--partition=x86-64-amd-zen2-node', '--export=NONE'], 'descr': 'Zen2, 16 cores, 30 GB', diff --git a/config/github_actions.py b/config/github_actions.py index 4df3f06a..5328f6f3 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -1,7 +1,7 @@ # ReFrame configuration file that can be used in GitHub Actions with EESSI from eessi.testsuite.common_config import common_logging_config -from eessi.testsuite.constants import * # noqa: F403 +from eessi.testsuite.constants import * site_configuration = { @@ -26,18 +26,18 @@ } ], 'max_jobs': 1 - } - ] - } - ], + } + ] + } + ], 'environments': [ { 'name': 'default', 'cc': 'cc', 'cxx': '', 'ftn': '' - } - ], + } + ], 'general': [ { 'purge_environment': True, diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index 881bd4df..dbd62411 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -47,7 +47,7 @@ ], 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs - 'access': ['-p qcpu', '-A DD-23-96', '--export=None'], + 'access': ['-p qcpu', '-A DD-23-96', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'features': [ @@ -55,42 +55,42 @@ ] + list(SCALES.keys()), 'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/' }, -# We don't have GPU budget on Karolina at this time -# { -# 'name': 'qgpu', -# 'scheduler': 'slurm', -# 'prepare_cmds': [ -# 
'source %s' % common_eessi_init(), -# # Pass job environment variables like $PATH, etc., into job steps -# 'export SLURM_EXPORT_ENV=ALL', -# # Needed when using srun launcher -# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega -# # Avoid https://github.com/EESSI/software-layer/issues/136 -# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) -# 'export OMPI_MCA_pml=ucx', -# ], -# 'launcher': 'mpirun', -# # Use --export=None to avoid that login environment is passed down to submitted jobs -# 'access': ['-p gpu', '-A DD-23-96', '--export=None'], -# 'environs': ['default'], -# 'max_jobs': 60, -# 'devices': [ -# { -# 'type': DEVICE_TYPES[GPU], -# 'num_devices': 8, -# } -# ], -# 'resources': [ -# { -# 'name': '_rfm_gpu', -# 'options': ['--gpus-per-node={num_gpus_per_node}'], -# } -# ], -# 'features': [ -# FEATURES[GPU], -# ] + list(SCALES.keys()), -# 'descr': 'GPU partition with accelerated nodes, see https://docs.it4i.cz/karolina/hardware-overview/' -# }, + # We don't have GPU budget on Karolina at this time + # { + # 'name': 'qgpu', + # 'scheduler': 'slurm', + # 'prepare_cmds': [ + # 'source %s' % common_eessi_init(), + # # Pass job environment variables like $PATH, etc., into job steps + # 'export SLURM_EXPORT_ENV=ALL', + # # Needed when using srun launcher + # # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega + # # Avoid https://github.com/EESSI/software-layer/issues/136 + # # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) + # 'export OMPI_MCA_pml=ucx', + # ], + # 'launcher': 'mpirun', + # # Use --export=None to avoid that login environment is passed down to submitted jobs + # 'access': ['-p gpu', '-A DD-23-96', '--export=None'], + # 'environs': ['default'], + # 'max_jobs': 60, + # 'devices': [ + # { + # 'type': DEVICE_TYPES[GPU], + # 'num_devices': 8, + # } + # ], + # 'resources': [ + # { + # 'name': '_rfm_gpu', + # 'options': 
['--gpus-per-node={num_gpus_per_node}'], + # } + # ], + # 'features': [ + # FEATURES[GPU], + # ] + list(SCALES.keys()), + # 'descr': 'GPU partition with accelerated nodes, https://docs.it4i.cz/karolina/hardware-overview/' + # }, ] }, ], diff --git a/config/izum_vega.py b/config/izum_vega.py index 90357553..4dca647a 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -47,7 +47,7 @@ ], 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs - 'access': ['-p cpu', '--export=None'], + 'access': ['-p cpu', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'resources': [ @@ -76,7 +76,7 @@ ], 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs - 'access': ['-p gpu', '--export=None'], + 'access': ['-p gpu', '--export=None'], 'environs': ['default'], 'max_jobs': 60, 'devices': [ diff --git a/config/settings_example.py b/config/settings_example.py index 0f61496a..7d53c58f 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -20,7 +20,7 @@ import os from eessi.testsuite.common_config import common_logging_config, format_perfvars, perflog_format -from eessi.testsuite.constants import * # noqa: F403 +from eessi.testsuite.constants import * site_configuration = { @@ -35,10 +35,10 @@ 'partitions': [ { 'name': 'cpu_partition', - 'descr': 'CPU partition' + 'descr': 'CPU partition', 'scheduler': 'slurm', 'launcher': 'mpirun', - 'access': ['-p cpu', '--export=None'], + 'access': ['-p cpu', '--export=None'], 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'environs': ['default'], 'max_jobs': 4, @@ -62,20 +62,20 @@ }, { 'name': 'gpu_partition', - 'descr': 'GPU partition' + 'descr': 'GPU partition', 'scheduler': 'slurm', 'launcher': 'mpirun', - 'access': ['-p gpu', '--export=None'], + 'access': ['-p gpu', '--export=None'], 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], 'environs': 
['default'], 'max_jobs': 4, # We recommend to rely on ReFrame's CPU autodetection, # and only define the 'processor' field if autodetection fails # 'processor': { - # 'num_cpus': 72, - # 'num_sockets': 2, - # 'num_cpus_per_socket': 36, - # 'num_cpus_per_core': 1, + # 'num_cpus': 72, + # 'num_sockets': 2, + # 'num_cpus_per_socket': 36, + # 'num_cpus_per_core': 1, # }, 'resources': [ { diff --git a/config/surf_snellius.py b/config/surf_snellius.py index e42bfeac..f243ce9a 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -41,7 +41,7 @@ 'scheduler': 'slurm', 'prepare_cmds': ['source %s' % common_eessi_init()], 'launcher': 'mpirun', - 'access': ['-p rome', '--export=None'], + 'access': ['-p rome', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'resources': [ @@ -60,7 +60,7 @@ 'scheduler': 'slurm', 'prepare_cmds': ['source %s' % common_eessi_init()], 'launcher': 'mpirun', - 'access': ['-p genoa', '--export=None'], + 'access': ['-p genoa', '--export=None'], 'environs': ['default'], 'max_jobs': 120, 'resources': [ @@ -80,7 +80,7 @@ 'scheduler': 'slurm', 'prepare_cmds': ['source %s' % common_eessi_init()], 'launcher': 'mpirun', - 'access': ['-p gpu', '--export=None'], + 'access': ['-p gpu', '--export=None'], 'environs': ['default'], 'max_jobs': 60, 'devices': [ diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index e8f1a721..12c683d8 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -143,7 +143,7 @@ def command(self, job): } ], 'devices': [ - { + { 'type': DEVICE_TYPES[GPU], 'num_devices': 4, } diff --git a/eessi/testsuite/common_config.py b/eessi/testsuite/common_config.py index ecf2d0c5..5832c7af 100644 --- a/eessi/testsuite/common_config.py +++ b/eessi/testsuite/common_config.py @@ -69,33 +69,35 @@ def common_logging_config(prefix=None): ], }] + def common_eessi_init(eessi_version=None): """ Returns the full path that should be sourced to initialize the EESSI environment for a given version of EESSI. 
- If no eessi_version is passed, the EESSI_VERSION environment variable is read. If that is also not defined, default behaviour is to use `latest`. + If no eessi_version is passed, the EESSI_VERSION environment variable is read. + If that is also not defined, default behaviour is to use `latest`. :param eessi_version: version of EESSI that should be sourced (e.g. '2023.06' or 'latest') [optional] """ # Check which EESSI_CVMFS_REPO we are running under eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None) if eessi_cvmfs_repo is None: - err_msg = "Environment variable 'EESSI_CVMFS_REPO' was not found. " - err_msg += "Did you initialize the EESSI environment before running the test suite?" + err_msg = "Environment variable 'EESSI_CVMFS_REPO' was not found." + err_msg += " Did you initialize the EESSI environment before running the test suite?" raise ValueError(err_msg) if eessi_cvmfs_repo == '/cvmfs/pilot.eessi-hpc.org': - if eessi_version == None: + if eessi_version is None: # Try also EESSI_VERSION for backwards compatibility with previous common_eessi_init implementation eessi_version = os.getenv('EESSI_PILOT_VERSION', os.getenv('EESSI_VERSION', 'latest')) else: - # software.eessi.io, or another where we assume the same variable names to be used - if eessi_version == None: + # software.eessi.io, or another where we assume the same variable names to be used + if eessi_version is None: eessi_version = os.getenv('EESSI_VERSION', None) # Without EESSI_VERSION, we don't know what to do. There is no default/latest version # So, report error - if eessi_version == None: - err_msg = "Environment variable 'EESSI_VERSION' was not found. " - err_msg += "Did you initialize the EESSI environment before running the test suite?" + if eessi_version is None: + err_msg = "Environment variable 'EESSI_VERSION' was not found." + err_msg += " Did you initialize the EESSI environment before running the test suite?" 
raise ValueError(err_msg) - + if eessi_cvmfs_repo == '/cvmfs/pilot.eessi-hpc.org' and eessi_version == 'latest': return '/cvmfs/pilot.eessi-hpc.org/latest/init/bash' else: diff --git a/eessi/testsuite/tests/apps/gromacs.py b/eessi/testsuite/tests/apps/gromacs.py index 8e14cbb4..a3d9e625 100644 --- a/eessi/testsuite/tests/apps/gromacs.py +++ b/eessi/testsuite/tests/apps/gromacs.py @@ -30,6 +30,7 @@ """ import reframe as rfm +from reframe.core.builtins import parameter, run_after # added only to make the linter happy from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index bb5f6db1..3a6f9dbe 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -6,9 +6,11 @@ non-GPU nodes. Otherwise those tests will FAIL. """ import reframe as rfm -from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark +from reframe.core.builtins import parameter, run_after # added only to make the linter happy from reframe.utility import reframe +from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark + from eessi.testsuite import hooks, utils from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log @@ -127,7 +129,6 @@ def adjust_executable_opts(self): if self.device_type == DEVICE_TYPES[CPU]: self.executable_opts = [ele for ele in self.executable_opts if ele != 'D'] - @run_after('setup') def set_num_tasks_per_node(self): """ Setting number of tasks per node and cpus per task in this function. 
This function sets num_cpus_per_task @@ -168,7 +169,6 @@ class EESSI_OSU_Micro_Benchmarks_coll(osu_benchmark): # Unset num_tasks_per_node from hpctestlib num_tasks_per_node = None - @run_after('init') def run_after_init(self): """hooks to run after init phase""" @@ -177,7 +177,7 @@ def run_after_init(self): self.device_buffers = 'cpu' # Filter on which scales are supported by the partitions defined in the ReFrame configuration hooks.filter_supported_scales(self) - hooks.filter_valid_systems_by_device_type( self, required_device_type=self.device_type) + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type) is_cuda_module = utils.is_cuda_required_module(self.module_name) if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: self.device_buffers = 'cuda' @@ -191,19 +191,16 @@ def run_after_init(self): self.valid_systems = [] hooks.set_modules(self) - @run_after('init') def set_tag_ci(self): - if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce' or - self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): + if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce' + or self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): self.tags.add('CI') if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce'): self.tags.add('osu_allreduce') - if (self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): self.tags.add('osu_alltoall') - @run_after('init') def set_mem(self): """ Setting an extra job option of memory. The alltoall operation takes maximum memory of 0.1 GB per core for a @@ -221,37 +218,36 @@ def set_num_tasks_per_node(self): """ Setting number of tasks per node, cpus per task and gpus per node in this function. 
This function sets num_cpus_per_task for 1 node and 2 node options where the request is for full nodes.""" max_avail_cpus_per_node = self.current_partition.processor.num_cpus - if(self.device_buffers == 'cpu'): + if self.device_buffers == 'cpu': # Setting num_tasks and num_tasks_per_node for the CPU tests - if(SCALES.get(self.scale).get('num_cpus_per_node', 0)): + if SCALES.get(self.scale).get('num_cpus_per_node', 0): hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], self.default_num_cpus_per_node) - elif(SCALES.get(self.scale).get('node_part', 0)): + elif SCALES.get(self.scale).get('node_part', 0): pass_num_per = int(max_avail_cpus_per_node / SCALES.get(self.scale).get('node_part', 0)) - if(pass_num_per > 1): + if pass_num_per > 1: hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], pass_num_per) else: self.skip(msg="Too few cores available for a collective operation.") - if(FEATURES[GPU] in self.current_partition.features): + if FEATURES[GPU] in self.current_partition.features: max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) # Setting number of GPU for a cpu test on a GPU node. - if(SCALES.get(self.scale).get('num_nodes') == 1): + if SCALES.get(self.scale).get('num_nodes') == 1: self.num_gpus_per_node = 1 else: self.num_gpus_per_node = max_avail_gpus_per_node - elif(self.device_buffers == 'cuda'): + elif self.device_buffers == 'cuda': max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) # Setting num_tasks and num_tasks_per_node for the GPU tests - if(max_avail_gpus_per_node == 1 and - SCALES.get(self.scale).get('num_nodes') == 1): - self.skip(msg="There is only 1 device within the node. Skipping collective tests involving only 1 node.") + if max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1: + self.skip(msg="There is only 1 device in the node. 
Skipping collective tests involving only 1 node.") else: - if(SCALES.get(self.scale).get('num_gpus_per_node', 0) * SCALES.get(self.scale).get('num_nodes', 0) > 1): + if SCALES.get(self.scale).get('num_gpus_per_node', 0) * SCALES.get(self.scale).get('num_nodes', 0) > 1: hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU])) - elif(SCALES.get(self.scale).get('node_part', 0)): + elif SCALES.get(self.scale).get('node_part', 0): pass_num_per = int(max_avail_gpus_per_node / SCALES.get(self.scale).get('node_part', 0)) - if(pass_num_per > 1): + if pass_num_per > 1: hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU])) else: self.skip(msg="Total GPUs (max_avail_gpus_per_node / node_part) is 1 less.") diff --git a/eessi/testsuite/tests/apps/tensorflow/src/mnist_setup.py b/eessi/testsuite/tests/apps/tensorflow/src/mnist_setup.py index 8d47cf0e..1850a7ac 100644 --- a/eessi/testsuite/tests/apps/tensorflow/src/mnist_setup.py +++ b/eessi/testsuite/tests/apps/tensorflow/src/mnist_setup.py @@ -1,31 +1,33 @@ import tensorflow as tf import numpy as np + def mnist_dataset(batch_size, test_batch_size): - (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() - # The `x` arrays are in uint8 and have values in the [0, 255] range. - # You need to convert them to float32 with values in the [0, 1] range. - x_train = x_train / np.float32(255) - y_train = y_train.astype(np.int64) - x_test = x_test / np.float32(255) - y_test = y_test.astype(np.int64) - train_dataset = tf.data.Dataset.from_tensor_slices( - (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) - test_dataset = tf.data.Dataset.from_tensor_slices( - (x_test, y_test)).batch(test_batch_size) - return train_dataset, test_dataset + (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() + # The `x` arrays are in uint8 and have values in the [0, 255] range. + # You need to convert them to float32 with values in the [0, 1] range. 
+ x_train = x_train / np.float32(255) + y_train = y_train.astype(np.int64) + x_test = x_test / np.float32(255) + y_test = y_test.astype(np.int64) + train_dataset = tf.data.Dataset.from_tensor_slices( + (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) + test_dataset = tf.data.Dataset.from_tensor_slices( + (x_test, y_test)).batch(test_batch_size) + return train_dataset, test_dataset + def build_and_compile_cnn_model(): - model = tf.keras.Sequential([ - tf.keras.layers.InputLayer(input_shape=(28, 28)), - tf.keras.layers.Reshape(target_shape=(28, 28, 1)), - tf.keras.layers.Conv2D(32, 3, activation='relu'), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ]) - model.compile( - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), - metrics=['accuracy']) - return model + model = tf.keras.Sequential([ + tf.keras.layers.InputLayer(input_shape=(28, 28)), + tf.keras.layers.Reshape(target_shape=(28, 28, 1)), + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dense(10) + ]) + model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), + metrics=['accuracy']) + return model diff --git a/eessi/testsuite/tests/apps/tensorflow/src/tf_test.py b/eessi/testsuite/tests/apps/tensorflow/src/tf_test.py index ac41415b..c908a8e2 100644 --- a/eessi/testsuite/tests/apps/tensorflow/src/tf_test.py +++ b/eessi/testsuite/tests/apps/tensorflow/src/tf_test.py @@ -14,6 +14,7 @@ import tensorflow as tf + def print0(msg, comm=MPI.COMM_WORLD): '''Prints string "msg" from rank 0''' output = comm.gather(msg, root=0) @@ -23,13 +24,15 @@ def print0(msg, comm=MPI.COMM_WORLD): for (rank, rank_out) in enumerate(output): print(f'Rank {rank}: {rank_out}') + def find_free_port(): 
'''Function that gets a free port for the current process''' with closing(socket.socket()) as s: s.bind(('', 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR,1) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) return s.getsockname()[1] + def get_local_rank(rank_info, rank_info_list): '''Function that figures out the local rank based on a list of rank, hostname, and port gathered from each of the workers''' @@ -38,24 +41,26 @@ def get_local_rank(rank_info, rank_info_list): # our local rank is n for index, item in enumerate(rank_info_list): if item['hostname'] == rank_info['hostname'] and item['rank'] == rank_info['rank']: - return index + return index + def get_rank_info(comm=MPI.COMM_WORLD): '''Create a dict for this worker containing rank, hostname and port to be used by this worker''' rank = comm.Get_rank() hostname = socket.gethostname() port = find_free_port() - + return { 'rank': rank, 'hostname': hostname, 'port': port, } + def set_tf_config(rank_info, rank_info_list): '''Sets the TF_CONFIG environment variable for the current worker, based on the rank_info_list''' worker_list = ['%s:%s' % (item['hostname'], item['port']) for item in rank_info_list] - + tf_config = { 'cluster': { 'worker': worker_list, @@ -63,28 +68,38 @@ def set_tf_config(rank_info, rank_info_list): 'task': {'type': 'worker', 'index': rank_info['rank']} } os.environ["TF_CONFIG"] = json.dumps(tf_config) - + # logger.info(f"Set TF_CONFIG for rank {rank_info['rank']} to {tf_config}.") print0(f"Set TF_CONFIG for rank {rank_info['rank']} to {tf_config}.") return tf_config + parser = argparse.ArgumentParser( - prog='Tensorflow Distributed Test', - description='This program runs a distributed TensorFlow test using the tf.distribute.MultiWorkerMirroredStrategy and the Keras fit API' -) -parser.add_argument('-d', '--device', type=str, default='cpu', choices=['cpu', 'gpu'], help='Device to use for training') -parser.add_argument('--inter-op-parallelism', type=int, default=1, 
help='Sets tf.config.threading.set_inter_op_parallelism_threads') -parser.add_argument('--intra-op-parallelism', type=int, default=0, help='Sets tf.config.threading.set_intra_op_parallelism_threads') -parser.add_argument('--per-worker-batch-size', type=int, default=4096, help='Batch size processed by each worker') -parser.add_argument('--per-worker-test-batch-size', type=int, default=512, help='Batch size for computing accuracy on the validation set') -parser.add_argument('--epochs-to-train', type=int, default=4, help='Number of epochs to train') -parser.add_argument('--steps-per-epoch', type=int, default=25, help='Number of steps to train per epoch') + prog='Tensorflow Distributed Test', + description='This program runs a distributed TensorFlow test using the tf.distribute.MultiWorkerMirroredStrategy' + ' and the Keras fit API') + +parser.add_argument( + '-d', '--device', type=str, default='cpu', choices=['cpu', 'gpu'], help='Device to use for training') +parser.add_argument( + '--inter-op-parallelism', type=int, default=1, help='Sets tf.config.threading.set_inter_op_parallelism_threads') +parser.add_argument( + '--intra-op-parallelism', type=int, default=0, help='Sets tf.config.threading.set_intra_op_parallelism_threads') +parser.add_argument( + '--per-worker-batch-size', type=int, default=4096, help='Batch size processed by each worker') +parser.add_argument( + '--per-worker-test-batch-size', type=int, default=512, + help='Batch size for computing accuracy on the validation set') +parser.add_argument( + '--epochs-to-train', type=int, default=4, help='Number of epochs to train') +parser.add_argument( + '--steps-per-epoch', type=int, default=25, help='Number of steps to train per epoch') args = parser.parse_args() # Make sure we can import mnist_setup from current dir if '.' 
not in sys.path: - sys.path.insert(0, '.') -import mnist_setup + sys.path.insert(0, '.') +import mnist_setup # noqa: E402 os.environ.pop('TF_CONFIG', None) @@ -101,27 +116,29 @@ def set_tf_config(rank_info, rank_info_list): local_rank = get_local_rank(rank_info, rank_info_list) # Create logger per rank -#logging.basicConfig( -# filename='rank_%s.out' % local_rank, -# format='%(asctime)s %(levelname)s %(message)s', -# datefmt='%Y-%m-%d %H:%M:%S' -#) -#logging.info(f"Rank {rank_info['rank']} has local_rank {local_rank}, hostname {rank_info['hostname']} and port {rank_info['port']}") -print0(f"Rank {rank_info['rank']} has local_rank {local_rank}, hostname {rank_info['hostname']} and port {rank_info['port']}") +# logging.basicConfig( +# filename='rank_%s.out' % local_rank, +# format='%(asctime)s %(levelname)s %(message)s', +# datefmt='%Y-%m-%d %H:%M:%S' +# ) +# logging.info(f"Rank {rank_info['rank']} has local_rank {local_rank}, hostname {rank_info['hostname']}" +# f" and port {rank_info['port']}") +print0(f"Rank {rank_info['rank']} has local_rank {local_rank}, hostname {rank_info['hostname']}" + f" and port {rank_info['port']}") # Turn off tensorflow info and warnings for rank != 0 if local_rank != 0: print("Turning off logging") - #tf.get_logger().setLevel('ERROR') - #tf.autograph.set_verbosity(1) - #tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + # tf.get_logger().setLevel('ERROR') + # tf.autograph.set_verbosity(1) + # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' tf_config = set_tf_config(rank_info, rank_info_list) num_workers = len(tf_config['cluster']['worker']) # Set visible devices and create MultiWorkerMirroredStrategy -#logging.info(f"Selecting device: {args.device}") +# logging.info(f"Selecting device: {args.device}") print0(f"Selecting device: {args.device}") if args.device == 'gpu': # Limit each local rank to its own GPU. 
@@ -134,10 +151,10 @@ def set_tf_config(rank_info, rank_info_list): # logging.info("Local rank: %s, visible_devices: %s" % (local_rank, visible_devices)) print0("Local rank: %s, visible_devices: %s" % (local_rank, visible_devices)) assert len(visible_devices) == 1 - except: + except Exception: print0("ERROR: Selection of GPU device based on local rank failed. Local rank: %s. Selected devices: %s" - % (local_rank, visible_devices)) - # logging.error("Selection of GPU device based on local rank failed. Local rank: %s. Selected devices: %s" + % (local_rank, visible_devices)) + # logging.error("Selection of GPU device based on local rank failed. Local rank: %s. Selected devices: %s" # % (local_rank, visible_devices)) # Should now have 1 GPU per process. Set memory growth for that device to avoid issues similar to @@ -156,7 +173,7 @@ def set_tf_config(rank_info, rank_info_list): tf.config.set_visible_devices([], 'GPU') visible_devices = tf.config.get_visible_devices() print0("Local rank: %s, visible_devices: %s" % (local_rank, visible_devices)) - # logging.info("Local rank: %s, visible_devices: %s" % (local_rank, visible_devices)) + # logging.info("Local rank: %s, visible_devices: %s" % (local_rank, visible_devices)) strategy = tf.distribute.MultiWorkerMirroredStrategy() # logging.info("Multiworker strategy created") print0("Multiworker strategy created") @@ -171,14 +188,15 @@ def set_tf_config(rank_info, rank_info_list): # Run the training starttime = timer() -multi_worker_model.fit(multi_worker_dataset, epochs=args.epochs_to_train, steps_per_epoch=args.steps_per_epoch, verbose=2) +multi_worker_model.fit( + multi_worker_dataset, epochs=args.epochs_to_train, steps_per_epoch=args.steps_per_epoch, verbose=2) endtime = timer() -#logging.info("Keras fit completed!") +# logging.info("Keras fit completed!") print0("Keras fit completed!") # Compute performance -training_time = endtime-starttime -total_samples_trained = 
global_batch_size*args.steps_per_epoch*args.epochs_to_train +training_time = endtime - starttime +total_samples_trained = global_batch_size * args.steps_per_epoch * args.epochs_to_train throughput = total_samples_trained / training_time if local_rank == 0: print(f"Total training time: {training_time}") diff --git a/eessi/testsuite/tests/apps/tensorflow/tensorflow.py b/eessi/testsuite/tests/apps/tensorflow/tensorflow.py index 023dfd82..f37194f8 100644 --- a/eessi/testsuite/tests/apps/tensorflow/tensorflow.py +++ b/eessi/testsuite/tests/apps/tensorflow/tensorflow.py @@ -10,6 +10,7 @@ from eessi.testsuite import hooks, utils from eessi.testsuite.constants import * # noqa + @rfm.simple_test class EESSI_TensorFlow(rfm.RunOnlyRegressionTest): @@ -50,7 +51,7 @@ def assert_completion(self): @deferrable def assert_convergence(self): '''Assert that the network learned _something_ during training''' - accuracy = sn.extractsingle('^Final accuracy: (?P<accuracy>\S+)', self.stdout, 'accuracy', float) + accuracy = sn.extractsingle('^Final accuracy: (?P<accuracy>\S+)', self.stdout, 'accuracy', float) # noqa: W605 # mnist is a 10-class classification problem, so if accuracy >> 0.2 the network 'learned' something return sn.assert_gt(accuracy, 0.2) diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index 79ae1ec4..9357cc60 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -9,7 +9,6 @@ import reframe as rfm import reframe.core.runtime as rt from reframe.frontend.printer import PrettyPrinter -from reframe.utility import OrderedSet from eessi.testsuite.constants import * @@ -62,11 +61,11 @@ def find_modules(regex: str, name_only=True) -> Iterator[str]: Return all modules matching the regular expression regex. Note that since we use re.search, a module matches if the regex matches the module name at any place. I.e. 
the match does not have to be at the start of the smodule name - + Arguments: - regex: a regular expression - name_only: regular expressions will only be matched on the module name, not the version (default: True). - + Note: the name_only feature assumes anything after the last forward '/' is the version, and strips that before doing a match. @@ -109,14 +108,15 @@ def find_modules(regex: str, name_only=True) -> Iterator[str]: # Match the actual regular expression log(f"Matching module {mod} with regex {regex}") if re.search(regex, mod): - log(f"Match!") + log("Match!") yield orig_mod + def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: """ Checks if a processor feature is defined (i.e. if test.current_partition.processor. is defined) - If not, throws an informative error message. - + If not, throws an informative error message. + Arguments: - test: the reframe regression test instance for which should be checked if the processor feature is defined - attribute: attribute of the processor object, as defined by systems.partitions.processor @@ -125,7 +125,7 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: - True (bool) if the attribute is defined - Function does not return (but raises an error) if the attribute is undefined - Current known attributes in ReFrame are arch, num_cpus, num_cpus_per_core and topology, + Current known attributes in ReFrame are arch, num_cpus, num_cpus_per_core and topology, but this may change in the future. If ReFrame's autodetect feature is used, all of these should be properly defined, so that's what we advice. @@ -149,4 +149,3 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: "This is a programming error, please report this issue." 
) raise AttributeError(msg) - From 08ba8e49d77610ae71b0f9d591634b5d7e7a8b10 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 29 Feb 2024 17:22:46 +0100 Subject: [PATCH 5/5] use isinstance for type checking --- eessi/testsuite/hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 82c46430..c06ff572 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -90,8 +90,8 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n # Check if either node_part, or default_num_cpus_per_node and default_num_gpus_per_node are set correctly if not ( - type(test.node_part) == int - or (type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int) + isinstance(test.node_part, int) + or (isinstance(test.default_num_cpus_per_node, int) and isinstance(test.default_num_gpus_per_node, int)) ): raise ValueError( f'Either node_part ({test.node_part}), or default_num_cpus_per_node ({test.default_num_cpus_per_node}) and'