From 47140cda8c1c2078cea563f0241309db952dfdd5 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Fri, 15 Sep 2023 12:15:18 -0400 Subject: [PATCH] [Develop] Migrate internal storage to EFS from NFS exports Add backup and restore recipes to move data in second stage images to shared filesystems Create a new mount_internal_use_fs.rb recipe to mount the internal shared filesystems Filter the efs filesystem arrays to mount internal shared fses in init and cx fses in config Refactor environment recipes to be clearer in functional description and remove unnecessary recipes --- CHANGELOG.md | 1 + .../recipes/init.rb | 1 + .../recipes/update.rb | 4 +- .../attributes/environment.rb | 4 + .../kitchen.environment-config.yml | 85 ++++++++++--------- .../recipes/config.rb | 20 ++++- .../recipes/config/efs.rb | 41 +++++++++ .../recipes/config/export_home.rb} | 24 +++--- .../recipes/config/fs_mount.rb | 34 -------- .../recipes/config/mount_shared.rb | 48 ----------- .../recipes/config/shared_storages.rb | 59 ------------- .../{fs_update.rb => update_fs_mapping.rb} | 0 .../recipes/init.rb | 8 +- .../init/backup_internal_use_shared_data.rb | 32 +++++++ .../recipes/init/mount_internal_use_fs.rb | 53 ++++++++++++ .../init/restore_internal_use_shared_data.rb | 33 +++++++ .../recipes/{fs_mount_spec.rb => efs_spec.rb} | 7 +- ..._spec.rb => mount_internal_use_fs_spec.rb} | 13 ++- ..._spec.rb => mount_internal_use_fs_spec.rb} | 22 ++++- .../test/controls/shared_storages_spec.rb | 26 ------ .../attributes/slurm_attributes.rb | 1 - .../kitchen.slurm-config.yml | 14 --- .../recipes/config/config_compute.rb | 2 - .../recipes/config/config_head_node.rb | 7 -- .../recipes/config/config_login.rb | 2 - .../test/controls/mount_slurm_dir_spec.rb | 22 ----- kitchen.ec2.yml | 2 +- 27 files changed, 274 insertions(+), 291 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb rename cookbooks/{aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb => 
aws-parallelcluster-environment/recipes/config/export_home.rb} (51%) delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb rename cookbooks/aws-parallelcluster-environment/recipes/config/{fs_update.rb => update_fs_mapping.rb} (100%) create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb rename cookbooks/aws-parallelcluster-environment/spec/unit/recipes/{fs_mount_spec.rb => efs_spec.rb} (69%) rename cookbooks/aws-parallelcluster-environment/spec/unit/recipes/{mount_shared_spec.rb => mount_internal_use_fs_spec.rb} (65%) rename cookbooks/aws-parallelcluster-environment/test/controls/{mount_shared_spec.rb => mount_internal_use_fs_spec.rb} (65%) delete mode 100644 cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb delete mode 100644 cookbooks/aws-parallelcluster-slurm/test/controls/mount_slurm_dir_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 63938c8ac..a7b776d87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install [Spack](https://spack.io) by default in cluster user's home directory. **CHANGES** +- Remove the NFS exports from the head node root volume and use Amazon EFS storage instead for intra-cluster shared ParallelCluster, Intel, Slurm, and Login Node data. **BUG FIXES** - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources. 
diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index edd70bcf9..538eeaf37 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -24,5 +24,6 @@ # Fetch config must be executed after the mount of the shared folders because the config will be saved there fetch_config 'Fetch and load cluster configs' +# Compute fleet init requires shared dirs include_recipe "aws-parallelcluster-computefleet::init" include_recipe "aws-parallelcluster-slurm::init" diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 116e03481..e10f15744 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -16,8 +16,8 @@ update true end -# generate the update shared storages mapping file -include_recipe 'aws-parallelcluster-environment::fs_update' +# generate the updated shared storages mapping file +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' include_recipe 'aws-parallelcluster-environment::directory_service' include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm' diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 1a6bd3e1a..fa78adfa1 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -53,5 +53,9 @@ default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_head'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes'] +# Since this is a shared directory, it needs to be defined here first 
instead of in the dependent cookbook for slurm +default['cluster']['slurm']['install_dir'] = '/opt/slurm' + +default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], node['cluster']['slurm']['install_dir'], "/opt/intel"] default['cluster']['head_node_private_ip'] = nil diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 86d346eb1..94bacfcf6 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -414,10 +414,10 @@ suites: fsx_shared_dirs: '' raid_shared_dir: '' ephemeral_dir: test1 - - name: fs_update + - name: update_fs_mapping run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_created_correctly @@ -440,10 +440,10 @@ suites: fsx_dns_names: dns1,dns2 fsx_mount_names: mount1,mount2 fsx_volume_junction_paths: value1,value2 - - name: fs_update_default_values + - name: update_fs_mapping_default_values run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_with_default_values @@ -472,42 +472,43 @@ suites: scheduler: slurm head_node_imds_secured: 'true' head_node_imds_allowed_users: ['root', 'nobody'] - - name: mount_shared_compute - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_compute - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - 
node_type: 'ComputeFleet' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' - - name: mount_shared_login - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_login - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'LoginNode' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' +# TODO replacement for the mount_internal_use_fs recipe since it uses shared storage +# - name: mount_shared_compute +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_compute +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'ComputeFleet' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' +# - name: mount_shared_login +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_login +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'LoginNode' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' - name: raid_compute run_list: - recipe[aws-parallelcluster-tests::setup] @@ -541,7 +542,7 @@ suites: - name: 
shared_storages_compute run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] verifier: controls: - shared_storages_compute_and_login @@ -555,7 +556,7 @@ suites: - name: shared_storages_login run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] verifier: controls: - shared_storages_compute_and_login diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 6eefa2b36..900f5e277 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,21 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# fs_update generates the shared storages mapping file so must be executed before shared storages recipes -include_recipe 'aws-parallelcluster-environment::fs_update' -include_recipe 'aws-parallelcluster-environment::shared_storages' +# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' +include_recipe 'aws-parallelcluster-environment::export_home' include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::fs_mount" +include_recipe "aws-parallelcluster-environment::efs" + +# Mount FSx directory with manage_fsx resource +lustre "mount fsx" do + fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') + fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') + fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') + fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') + fsx_mount_name_array 
node['cluster']['fsx_mount_names'].split(',') + fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') + action :mount + not_if { node['cluster']['fsx_fs_ids'].split(',').empty? } +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb new file mode 100644 index 000000000..a967ecece --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +cx_shared_dir_array = [] +cx_efs_fs_id_array = [] +cx_efs_encryption_array = [] +cx_efs_iam_array = [] + +# Identify the customer use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next if node['cluster']['internal_shared_dirs'].include?(dir) + cx_shared_dir_array.push(dir) + cx_efs_fs_id_array.push(efs_fs_id_array[index]) + cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + cx_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount efs" do + shared_dir_array cx_shared_dir_array + efs_fs_id_array cx_efs_fs_id_array + efs_encryption_in_transit_array cx_efs_encryption_array + efs_iam_authorization_array cx_efs_iam_array + action :mount + not_if { cx_shared_dir_array.empty? } +end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb similarity index 51% rename from cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb index 574c7a100..aa1e1e9db 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true -# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance with the # License. A copy of the License is located at @@ -11,13 +12,16 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -# Mount /opt/slurm over NFS -# Computemgtd config is under /opt/slurm/etc/pcluster; all compute nodes share a config -mount "#{node['cluster']['slurm']['install_dir']}" do - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['slurm']['install_dir']}" }) - fstype "nfs" - options node['cluster']['nfs']['hard_mount_options'] - action %i(mount enable) - retries 10 - retry_delay 6 +return if on_docker? + +case node['cluster']['node_type'] +when 'HeadNode' + volume "export /home" do + shared_dir "/home" + action :export + end +when 'ComputeFleet', 'LoginNode' + Chef::Log.info("Export only from the HeadNode") +else + raise "node_type must be HeadNode, ComputeFleet, or LoginNode" end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb deleted file mode 100644 index 255be867b..000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb +++ /dev/null @@ -1,34 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
- -# Mount EFS directory with efs resource -efs "mount efs" do - shared_dir_array node['cluster']['efs_shared_dirs'].split(',') - efs_fs_id_array node['cluster']['efs_fs_ids'].split(',') - efs_encryption_in_transit_array node['cluster']['efs_encryption_in_transits'].split(',') - efs_iam_authorization_array node['cluster']['efs_iam_authorizations'].split(',') - action :mount - not_if { node['cluster']['efs_shared_dirs'].split(',').empty? } -end - -# Mount FSx directory with manage_fsx resource -lustre "mount fsx" do - fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') - fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') - fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') - fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') - fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') - fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') - action :mount - not_if { node['cluster']['fsx_fs_ids'].split(',').empty? } -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb deleted file mode 100644 index d6632d4ef..000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb +++ /dev/null @@ -1,48 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? 
- -case node['cluster']['node_type'] -when 'ComputeFleet' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared over NFS - volume "mount #{node['cluster']['shared_dir_compute']}" do - action :mount - shared_dir node['cluster']['shared_dir_compute'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_head']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end - -when 'LoginNode' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared_login_nodes over NFS - volume "mount #{node['cluster']['shared_dir_login']}" do - action :mount - shared_dir node['cluster']['shared_dir_login'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_login']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end -when 'HeadNode' - Chef::Log.info("Nothing to mount in the HeadNode") -else - raise "node_type must be HeadNode, LoginNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb deleted file mode 100644 index 56f34e204..000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb +++ /dev/null @@ -1,59 +0,0 @@ -# frozen_string_literal: true - -# -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? - -case node['cluster']['node_type'] -when 'HeadNode' - volume "export /home" do - shared_dir "/home" - action :export - end - - # Export /opt/parallelcluster/shared - volume "export #{node['cluster']['shared_dir']}" do - shared_dir node['cluster']['shared_dir'] - action :export - end - - # Export /opt/parallelcluster/shared_login_nodes - volume "export #{node['cluster']['shared_dir_login_nodes']}" do - shared_dir node['cluster']['shared_dir_login_nodes'] - action :export - end - - # Export /opt/intel only if exists - volume "export /opt/intel" do - shared_dir "/opt/intel" - only_if { ::File.directory?("/opt/intel") } - action :export - end - -when 'ComputeFleet', 'LoginNode' - # Mount /opt/intel over NFS only if it exists - exported_intel_dir = format_directory('/opt/intel') - volume "mount /opt/intel" do - action :mount - shared_dir '/opt/intel' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - only_if { ::File.directory?("/opt/intel") } - end - -else - raise "node_type must be HeadNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index cd3c801c9..8e465ee0e 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -13,12 +13,18 @@ # include_recipe 
"aws-parallelcluster-environment::cfnconfig_mixed" -include_recipe "aws-parallelcluster-environment::mount_shared" cloudwatch "Configure CloudWatch" do action :configure end +include_recipe "aws-parallelcluster-environment::update_fs_mapping" +include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data" +include_recipe "aws-parallelcluster-environment::mount_internal_use_fs" +include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data" + include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' + +# login nodes keys and directory service require shared storage include_recipe "aws-parallelcluster-environment::login_nodes_keys" include_recipe "aws-parallelcluster-environment::directory_service" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb new file mode 100644 index 000000000..6e472f7d1 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, back up the data to a temporary location + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Backup #{dir}" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp#{dir} + rsync -a #{dir}/ /tmp#{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb new file mode 100644 index 000000000..e25f89c4f --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +internal_shared_dir_array = [] +internal_efs_fs_id_array = [] +internal_efs_encryption_array = [] +internal_efs_iam_array = [] + +# Identify the internal use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next unless node['cluster']['internal_shared_dirs'].include?(dir) + # Don't mount the login nodes shared dir to compute nodes + next if node['cluster']['node_type'] == 'ComputeFleet' && dir == node['cluster']['shared_dir_login_nodes'] + internal_shared_dir_array.push(dir) + internal_efs_fs_id_array.push(efs_fs_id_array[index]) + internal_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + internal_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount internal use efs" do + shared_dir_array internal_shared_dir_array + efs_fs_id_array internal_efs_fs_id_array + efs_encryption_in_transit_array internal_efs_encryption_array + efs_iam_authorization_array internal_efs_iam_array + action :mount + not_if { internal_shared_dir_array.empty? 
} +end + +# TODO: replace home as NFS with shared /home +case node['cluster']['node_type'] +when 'ComputeFleet', 'LoginNode' + include_recipe 'aws-parallelcluster-environment::mount_home' +when 'HeadNode' + Chef::Log.info("Nothing to mount in the HeadNode") +else + raise "node_type must be HeadNode, LoginNode or ComputeFleet" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb new file mode 100644 index 000000000..83b0d71ec --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, restore the backed-up files that do not already exist on the shared storage + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage and backed up to a temporary location previously + # Remove the backup after the copy is done + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Restore #{dir}" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp#{dir}/ #{dir} + rm -rf /tmp#{dir}/ + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb similarity index 69% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb index 6fca809ad..9849bc533 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::fs_mount' do +describe 'aws-parallelcluster-environment::efs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ -13,12 +13,9 @@ end cached(:node) { chef_run.node } - describe 'call the efs for mounting' do + describe 'call efs for mounting' do it { is_expected.to mount_efs('mount efs') } end - describe 'call the lustre for mounting' do - it { is_expected.to mount_lustre("mount fsx") } - end end end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb rename to 
cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb index ad2b45e3e..c5e33aab6 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb @@ -1,12 +1,14 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::mount_shared' do +describe 'aws-parallelcluster-environment::mount_internal_use_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| node.override['cluster']['head_node_private_ip'] = '0.0.0.0' node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel) + node.override['cluster']['efs_shared_dirs'] = "/opt/slurm,/opt/intel" end runner.converge(described_recipe) end @@ -21,13 +23,8 @@ .with(retry_delay: 6) end - it 'mounts /opt/parallelcluster/shared' do - is_expected.to mount_volume('mount /opt/parallelcluster/shared') - .with(device: "0.0.0.0:/opt/parallelcluster/shared") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) + describe 'call efs for mounting' do + it { is_expected.to mount_efs('mount internal use efs') } end end end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb index fe3704818..e3861a20e 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb @@ -10,7 +10,7 @@ # See the 
License for the specific language governing permissions and limitations under the License. control 'mount_home' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the home directory is mounted' only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } @@ -22,7 +22,7 @@ end control 'mount_shared_compute' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.compute_node? } @@ -34,7 +34,7 @@ end control 'mount_shared_login' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.login_node? } @@ -44,3 +44,19 @@ its('options') { should include 'rw' } end end + +control 'shared_storages_compute_and_login' do + title 'Check the shared storages configuration for compute node' + + only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } + + describe 'Check that /opt/intel dir has been mounted' + describe mount("/opt/intel") do + it { should be_mounted } + its('device') { should eq "127.0.0.1:/opt/intel" } + its('type') { should eq 'nfs4' } + its('options') { should include 'hard' } + its('options') { should include '_netdev' } + its('options') { should include 'noatime' } + end +end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb deleted file mode 100644 index 6d9fe989f..000000000 --- a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License.
A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -control 'shared_storages_compute_and_login' do - title 'Check the shared storages configuration for compute node' - - only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } - - describe 'Check that /opt/intel dir has been mounted' - describe mount("/opt/intel") do - it { should be_mounted } - its('device') { should eq "127.0.0.1:/opt/intel" } - its('type') { should eq 'nfs4' } - its('options') { should include 'hard' } - its('options') { should include '_netdev' } - its('options') { should include 'noatime' } - end -end diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index b43bc8821..966fcd4df 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -4,7 +4,6 @@ # Slurm attributes shared between install_slurm and configure_slurm_accounting default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['sha256'] = '7290143a71ce2797d0df3423f08396fd5c0ae4504749ff372d6860b2d6a3a1b0' -default['cluster']['slurm']['install_dir'] = '/opt/slurm' default['cluster']['dns_domain'] = nil default['cluster']['use_private_hostname'] = 'false' diff --git a/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml b/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml index 52fbe82e3..b689bdf97 100644 --- a/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml +++ b/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml @@ -162,20 +162,6 @@ suites: local_hostname: dokken 
local_ipv4: 172.17.1.15 ipaddress: 172.17.1.15 - - name: mount_slurm_dir - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-slurm::mount_slurm_dir] - verifier: - controls: - - mount_slurm_dir - attributes: - dependencies: - - resource:nfs - - recipe:aws-parallelcluster-slurm::mock_slurm_dir - cluster: - node_type: ComputeFleet - head_node_private_ip: '127.0.0.1' - name: config_head_node_munge run_list: - recipe[aws-parallelcluster-tests::setup] diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb index 29a6fba00..9424e491c 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb @@ -24,8 +24,6 @@ mode '0700' end -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' - # Check to see if is GPU instance with Nvidia installed Chef::Log.warn("GPU instance but no Nvidia drivers found") if graphic_instance? && !nvidia_installed? diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb index 74113ead5..9c304d364 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb @@ -17,13 +17,6 @@ include_recipe 'aws-parallelcluster-slurm::config_munge_key' -# Export /opt/slurm -nfs_export "#{node['cluster']['slurm']['install_dir']}" do - network get_vpc_cidr_list - writeable true - options ['no_root_squash'] -end unless on_docker? 
- # Ensure config directory is in place directory "#{node['cluster']['slurm']['install_dir']}" do user 'root' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb index ce562c2c4..47df2dd4d 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb @@ -17,5 +17,3 @@ # TODO: rename, find a better name that include login nodes setup_munge_compute_node - -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/mount_slurm_dir_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/mount_slurm_dir_spec.rb deleted file mode 100644 index 682060030..000000000 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/mount_slurm_dir_spec.rb +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -control 'mount_slurm_dir' do - title 'Check if the slurm install dir is mounted' - - only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) 
} - - describe mount('/opt/slurm') do - it { should be_mounted } - its('type') { should eq 'nfs4' } - its('options') { should include 'rw' } - end -end diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8bb9cdcde..a3fdfee52 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -187,7 +187,7 @@ platforms: image_id: <%= ENV['KITCHEN_UBUNTU2204_AMI'] %> <% else %> image_search: - name: ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106 + name: <% if ENV['KITCHEN_PHASE']=='install' %>ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106<% else %><%= pcluster_prefix %>-ubuntu-2204-lts-hvm-*<% end %> architecture: <%= ENV['KITCHEN_ARCHITECTURE'] %> <% end %> block_device_mappings: