diff --git a/CHANGELOG.md b/CHANGELOG.md index 63938c8ace..a7b776d878 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install [Spack](https://spack.io) by default in cluster user's home directory. **CHANGES** +- Remove the NFS exports from the head node root volume and use Amazon EFS storage instead for intra-cluster shared ParallelCluster, Intel, Slurm, and Login Node data. **BUG FIXES** - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources. diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index edd70bcf99..538eeaf37f 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -24,5 +24,6 @@ # Fetch config must be executed after the mount of the shared folders because the config will be saved there fetch_config 'Fetch and load cluster configs' +# Compute fleet init requires shared dirs include_recipe "aws-parallelcluster-computefleet::init" include_recipe "aws-parallelcluster-slurm::init" diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 116e034817..e10f157445 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -16,8 +16,8 @@ update true end -# generate the update shared storages mapping file -include_recipe 'aws-parallelcluster-environment::fs_update' +# generate the updated shared storages mapping file +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' include_recipe 'aws-parallelcluster-environment::directory_service' include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm' diff --git 
a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 1a6bd3e1a1..fa78adfa1e 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -53,5 +53,9 @@ default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_head'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes'] +# Since this is a shared directory, it needs to be defined here first instead of in the dependent cookbook for slurm +default['cluster']['slurm']['install_dir'] = '/opt/slurm' + +default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], node['cluster']['slurm']['install_dir'], "/opt/intel"] default['cluster']['head_node_private_ip'] = nil diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 86d346eb15..94bacfcf63 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -414,10 +414,10 @@ suites: fsx_shared_dirs: '' raid_shared_dir: '' ephemeral_dir: test1 - - name: fs_update + - name: update_fs_mapping run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_created_correctly @@ -440,10 +440,10 @@ suites: fsx_dns_names: dns1,dns2 fsx_mount_names: mount1,mount2 fsx_volume_junction_paths: value1,value2 - - name: fs_update_default_values + - name: update_fs_mapping_default_values run_list: - recipe[aws-parallelcluster-tests::setup] - - 
recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_with_default_values @@ -472,42 +472,43 @@ suites: scheduler: slurm head_node_imds_secured: 'true' head_node_imds_allowed_users: ['root', 'nobody'] - - name: mount_shared_compute - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_compute - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'ComputeFleet' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' - - name: mount_shared_login - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_login - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'LoginNode' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' +# TODO replacement for the mount_internal_use_fs recipe since it uses shared storage +# - name: mount_shared_compute +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_compute +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'ComputeFleet' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: 
'/fake_headnode_shared' +# - name: mount_shared_login +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_login +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'LoginNode' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' - name: raid_compute run_list: - recipe[aws-parallelcluster-tests::setup] @@ -541,7 +542,7 @@ suites: - name: shared_storages_compute run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] verifier: controls: - shared_storages_compute_and_login @@ -555,7 +556,7 @@ suites: - name: shared_storages_login run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] verifier: controls: - shared_storages_compute_and_login diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 6eefa2b369..900f5e277e 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,21 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# fs_update generates the shared storages mapping file so must be executed before shared storages recipes -include_recipe 'aws-parallelcluster-environment::fs_update' -include_recipe 'aws-parallelcluster-environment::shared_storages' +# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +include_recipe 
'aws-parallelcluster-environment::update_fs_mapping' +include_recipe 'aws-parallelcluster-environment::export_home' include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::fs_mount" +include_recipe "aws-parallelcluster-environment::efs" + +# Mount FSx directory with manage_fsx resource +lustre "mount fsx" do + fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') + fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') + fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') + fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') + fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') + fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') + action :mount + not_if { node['cluster']['fsx_fs_ids'].split(',').empty? } +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb new file mode 100644 index 0000000000..a967ececea --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +cx_shared_dir_array = [] +cx_efs_fs_id_array = [] +cx_efs_encryption_array = [] +cx_efs_iam_array = [] + +# Identify the customer use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next if node['cluster']['internal_shared_dirs'].include?(dir) + cx_shared_dir_array.push(dir) + cx_efs_fs_id_array.push(efs_fs_id_array[index]) + cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + cx_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount efs" do + shared_dir_array cx_shared_dir_array + efs_fs_id_array cx_efs_fs_id_array + efs_encryption_in_transit_array cx_efs_encryption_array + efs_iam_authorization_array cx_efs_iam_array + action :mount + not_if { cx_shared_dir_array.empty? } +end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb similarity index 51% rename from cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb index 574c7a100f..aa1e1e9dbc 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true -# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). 
You may not use this file except in compliance with the # License. A copy of the License is located at @@ -11,13 +12,16 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -# Mount /opt/slurm over NFS -# Computemgtd config is under /opt/slurm/etc/pcluster; all compute nodes share a config -mount "#{node['cluster']['slurm']['install_dir']}" do - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['slurm']['install_dir']}" }) - fstype "nfs" - options node['cluster']['nfs']['hard_mount_options'] - action %i(mount enable) - retries 10 - retry_delay 6 +return if on_docker? + +case node['cluster']['node_type'] +when 'HeadNode' + volume "export /home" do + shared_dir "/home" + action :export + end +when 'ComputeFleet', 'LoginNode' + Chef::Log.info("Export only from the HeadNode") +else + raise "node_type must be HeadNode, ComputeFleet, or LoginNode" end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb deleted file mode 100644 index 255be867b4..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb +++ /dev/null @@ -1,34 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
- -# Mount EFS directory with efs resource -efs "mount efs" do - shared_dir_array node['cluster']['efs_shared_dirs'].split(',') - efs_fs_id_array node['cluster']['efs_fs_ids'].split(',') - efs_encryption_in_transit_array node['cluster']['efs_encryption_in_transits'].split(',') - efs_iam_authorization_array node['cluster']['efs_iam_authorizations'].split(',') - action :mount - not_if { node['cluster']['efs_shared_dirs'].split(',').empty? } -end - -# Mount FSx directory with manage_fsx resource -lustre "mount fsx" do - fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') - fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') - fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') - fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') - fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') - fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') - action :mount - not_if { node['cluster']['fsx_fs_ids'].split(',').empty? } -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb deleted file mode 100644 index d6632d4eff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb +++ /dev/null @@ -1,48 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? 
- -case node['cluster']['node_type'] -when 'ComputeFleet' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared over NFS - volume "mount #{node['cluster']['shared_dir_compute']}" do - action :mount - shared_dir node['cluster']['shared_dir_compute'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_head']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end - -when 'LoginNode' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared_login_nodes over NFS - volume "mount #{node['cluster']['shared_dir_login']}" do - action :mount - shared_dir node['cluster']['shared_dir_login'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_login']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end -when 'HeadNode' - Chef::Log.info("Nothing to mount in the HeadNode") -else - raise "node_type must be HeadNode, LoginNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb deleted file mode 100644 index 56f34e2047..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb +++ /dev/null @@ -1,59 +0,0 @@ -# frozen_string_literal: true - -# -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? - -case node['cluster']['node_type'] -when 'HeadNode' - volume "export /home" do - shared_dir "/home" - action :export - end - - # Export /opt/parallelcluster/shared - volume "export #{node['cluster']['shared_dir']}" do - shared_dir node['cluster']['shared_dir'] - action :export - end - - # Export /opt/parallelcluster/shared_login_nodes - volume "export #{node['cluster']['shared_dir_login_nodes']}" do - shared_dir node['cluster']['shared_dir_login_nodes'] - action :export - end - - # Export /opt/intel only if exists - volume "export /opt/intel" do - shared_dir "/opt/intel" - only_if { ::File.directory?("/opt/intel") } - action :export - end - -when 'ComputeFleet', 'LoginNode' - # Mount /opt/intel over NFS only if it exists - exported_intel_dir = format_directory('/opt/intel') - volume "mount /opt/intel" do - action :mount - shared_dir '/opt/intel' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - only_if { ::File.directory?("/opt/intel") } - end - -else - raise "node_type must be HeadNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index cd3c801c9c..8e465ee0ea 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -13,12 +13,18 @@ # include_recipe 
"aws-parallelcluster-environment::cfnconfig_mixed" -include_recipe "aws-parallelcluster-environment::mount_shared" cloudwatch "Configure CloudWatch" do action :configure end +include_recipe "aws-parallelcluster-environment::update_fs_mapping" +include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data" +include_recipe "aws-parallelcluster-environment::mount_internal_use_fs" +include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data" + include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' + +# login nodes keys and directory service require shared storage include_recipe "aws-parallelcluster-environment::login_nodes_keys" include_recipe "aws-parallelcluster-environment::directory_service" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb new file mode 100644 index 0000000000..6e472f7d15 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each, backup the data to a temp location + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage. A directory absent on this node is skipped (the empty backup dir is still created for the restore step) + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Backup #{dir}" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp#{dir} + if [ -d #{dir} ]; then rsync -a #{dir}/ /tmp#{dir}; fi + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb new file mode 100644 index 0000000000..e25f89c4f2 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker?
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +internal_shared_dir_array = [] +internal_efs_fs_id_array = [] +internal_efs_encryption_array = [] +internal_efs_iam_array = [] + +# Identify the internal use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next unless node['cluster']['internal_shared_dirs'].include?(dir) + # Don't mount the login nodes shared dir to compute nodes + next if node['cluster']['node_type'] == 'ComputeFleet' && dir == node['cluster']['shared_dir_login_nodes'] + internal_shared_dir_array.push(dir) + internal_efs_fs_id_array.push(efs_fs_id_array[index]) + internal_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + internal_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount internal use efs" do + shared_dir_array internal_shared_dir_array + efs_fs_id_array internal_efs_fs_id_array + efs_encryption_in_transit_array internal_efs_encryption_array + efs_iam_authorization_array internal_efs_iam_array + action :mount + not_if { internal_shared_dir_array.empty? 
} +end + +# TODO: replace home as NFS with shared /home +case node['cluster']['node_type'] +when 'ComputeFleet', 'LoginNode' + include_recipe 'aws-parallelcluster-environment::mount_home' +when 'HeadNode' + Chef::Log.info("Nothing to mount in the HeadNode") +else + raise "node_type must be HeadNode, LoginNode or ComputeFleet" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb new file mode 100644 index 0000000000..83b0d71ec7 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each, restore the shared storage if it doesn't already exist + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage and backed up to a temporary location previously + # Remove the backup only if the copy succeeded, so a failed restore keeps the data and fails the resource + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Restore #{dir}" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp#{dir}/ #{dir} && + rm -rf /tmp#{dir}/ + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb similarity index 69% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb index 6fca809adf..9849bc533b 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::fs_mount' do +describe 'aws-parallelcluster-environment::efs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ -13,12 +13,9 @@ end cached(:node) { chef_run.node } - describe 'call the efs for mounting' do + describe 'call efs for mounting' do it { is_expected.to mount_efs('mount efs') } end - describe 'call the lustre for mounting' do - it { is_expected.to mount_lustre("mount fsx") } - end end end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb rename to
cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb index ad2b45e3ea..c5e33aab6d 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb @@ -1,12 +1,14 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::mount_shared' do +describe 'aws-parallelcluster-environment::mount_internal_use_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| node.override['cluster']['head_node_private_ip'] = '0.0.0.0' node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel) + node.override['cluster']['efs_shared_dirs'] = "/opt/slurm,/opt/intel" end runner.converge(described_recipe) end @@ -21,13 +23,8 @@ .with(retry_delay: 6) end - it 'mounts /opt/parallelcluster/shared' do - is_expected.to mount_volume('mount /opt/parallelcluster/shared') - .with(device: "0.0.0.0:/opt/parallelcluster/shared") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) + describe 'call efs for mounting' do + it { is_expected.to mount_efs('mount internal use efs') } end end end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb index fe37048180..e3861a20e6 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb @@ -10,7 +10,7 @@ # See 
the License for the specific language governing permissions and limitations under the License. control 'mount_home' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the home directory is mounted' only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } @@ -22,7 +22,7 @@ end control 'mount_shared_compute' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.compute_node? } @@ -34,7 +34,7 @@ end control 'mount_shared_login' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.login_node? } @@ -44,3 +44,19 @@ its('options') { should include 'rw' } end end + +control 'shared_storages_compute_and_login' do + title 'Check the shared storages configuration for compute node' + + only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } + + describe 'Check that /opt/intel dir has been mounted' + describe mount("/opt/intel") do + it { should be_mounted } + its('device') { should eq "127.0.0.1:/opt/intel" } + its('type') { should eq 'nfs4' } + its('options') { should include 'hard' } + its('options') { should include '_netdev' } + its('options') { should include 'noatime' } + end +end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb deleted file mode 100644 index 6d9fe989ff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License.
A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -control 'shared_storages_compute_and_login' do - title 'Check the shared storages configuration for compute node' - - only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } - - describe 'Check that /opt/intel dir has been mounted' - describe mount("/opt/intel") do - it { should be_mounted } - its('device') { should eq "127.0.0.1:/opt/intel" } - its('type') { should eq 'nfs4' } - its('options') { should include 'hard' } - its('options') { should include '_netdev' } - its('options') { should include 'noatime' } - end -end diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index b43bc88211..966fcd4df1 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -4,7 +4,6 @@ # Slurm attributes shared between install_slurm and configure_slurm_accounting default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['sha256'] = '7290143a71ce2797d0df3423f08396fd5c0ae4504749ff372d6860b2d6a3a1b0' -default['cluster']['slurm']['install_dir'] = '/opt/slurm' default['cluster']['dns_domain'] = nil default['cluster']['use_private_hostname'] = 'false' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb index 29a6fba001..9424e491c0 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb @@ -24,8 +24,6 @@ 
mode '0700' end -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' - # Check to see if is GPU instance with Nvidia installed Chef::Log.warn("GPU instance but no Nvidia drivers found") if graphic_instance? && !nvidia_installed? diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb index 74113ead54..9c304d3641 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb @@ -17,13 +17,6 @@ include_recipe 'aws-parallelcluster-slurm::config_munge_key' -# Export /opt/slurm -nfs_export "#{node['cluster']['slurm']['install_dir']}" do - network get_vpc_cidr_list - writeable true - options ['no_root_squash'] -end unless on_docker? - # Ensure config directory is in place directory "#{node['cluster']['slurm']['install_dir']}" do user 'root' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb index ce562c2c49..47df2dd4dd 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb @@ -17,5 +17,3 @@ # TODO: rename, find a better name that include login nodes setup_munge_compute_node - -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8bb9cdcde5..a3fdfee52f 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -187,7 +187,7 @@ platforms: image_id: <%= ENV['KITCHEN_UBUNTU2204_AMI'] %> <% else %> image_search: - name: ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106 + name: <% if ENV['KITCHEN_PHASE']=='install' %>ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end 
%>-server-20230106<% else %><%= pcluster_prefix %>-ubuntu-2204-lts-hvm-*<% end %> architecture: <%= ENV['KITCHEN_ARCHITECTURE'] %> <% end %> block_device_mappings: