diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ba7f33d..8b8f7214f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Add support for Rocky Linux 8. - Install [Spack](https://spack.io) by default in cluster user's home directory. - Add support for `Scheduling/SlurmSettings/Database/DatabaseName` parameter to render `StorageLoc` in the slurmdbd configuration generated by ParallelCluster. +- Add the option to use EFS storage instead of NFS exports from the head node root volume for intra-cluster shared ParallelCluster, Intel, Slurm, and login node data. **CHANGES** diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index edd70bcf9..538eeaf37 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -24,5 +24,6 @@ # Fetch config must be executed after the mount of the shared folders because the config will be saved there fetch_config 'Fetch and load cluster configs' +# Compute fleet init requires shared dirs include_recipe "aws-parallelcluster-computefleet::init" include_recipe "aws-parallelcluster-slurm::init" diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 116e03481..e10f15744 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -16,8 +16,8 @@ update true end -# generate the update shared storages mapping file -include_recipe 'aws-parallelcluster-environment::fs_update' +# generate the updated shared storages mapping file +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' include_recipe 'aws-parallelcluster-environment::directory_service' include_recipe 'aws-parallelcluster-slurm::update' if 
node['cluster']['scheduler'] == 'slurm' diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 1a6bd3e1a..c312a103c 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -53,5 +53,10 @@ default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_head'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes'] +# Since this is a shared directory, it needs to be defined here first instead of in the dependent cookbook for slurm +default['cluster']['slurm']['install_dir'] = '/opt/slurm' + +default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], node['cluster']['slurm']['install_dir'], "/opt/intel"] +default['cluster']['internal_initial_shared_dir'] = "#{node['cluster']['base_dir']}/init_shared" default['cluster']['head_node_private_ip'] = nil diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 86d346eb1..237961924 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -414,10 +414,10 @@ suites: fsx_shared_dirs: '' raid_shared_dir: '' ephemeral_dir: test1 - - name: fs_update + - name: update_fs_mapping run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_created_correctly @@ -440,10 +440,10 @@ suites: fsx_dns_names: dns1,dns2 fsx_mount_names: mount1,mount2 fsx_volume_junction_paths: value1,value2 - - name: fs_update_default_values + 
- name: update_fs_mapping_default_values run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_with_default_values @@ -475,7 +475,7 @@ suites: - name: mount_shared_compute run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] + - recipe[aws-parallelcluster-environment::mount_internal_use_ebs] verifier: controls: - mount_home @@ -493,7 +493,7 @@ suites: - name: mount_shared_login run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] + - recipe[aws-parallelcluster-environment::mount_internal_use_ebs] verifier: controls: - mount_home @@ -538,10 +538,10 @@ suites: node_type: LoginNode raid_shared_dir: raid1 head_node_private_ip: '127.0.0.1' - - name: shared_storages_compute + - name: shared_storages_compute_efs run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] verifier: controls: - shared_storages_compute_and_login @@ -552,10 +552,38 @@ suites: cluster: node_type: ComputeFleet head_node_private_ip: '127.0.0.1' - - name: shared_storages_login + - name: shared_storages_login_efs run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::efs] + verifier: + controls: + - shared_storages_compute_and_login + attributes: + dependencies: + - resource:nfs + - recipe:aws-parallelcluster-environment::mock_compute_shared_storages + cluster: + node_type: LoginNode + head_node_private_ip: '127.0.0.1' + - name: shared_storages_compute_ebs + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::mount_intel_dir] + verifier: + controls: + - shared_storages_compute_and_login 
+ attributes: + dependencies: + - resource:nfs + - recipe:aws-parallelcluster-environment::mock_compute_shared_storages + cluster: + node_type: ComputeFleet + head_node_private_ip: '127.0.0.1' + - name: shared_storages_login_ebs + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::mount_intel_dir] verifier: controls: - shared_storages_compute_and_login diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 6eefa2b36..a6e57c789 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,30 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# fs_update generates the shared storages mapping file so must be executed before shared storages recipes -include_recipe 'aws-parallelcluster-environment::fs_update' -include_recipe 'aws-parallelcluster-environment::shared_storages' +# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' +# Export home dir from the head node +include_recipe 'aws-parallelcluster-environment::export_home' + +if node['cluster']['internal_shared_storage_type'] == 'ebs' + # Export internal use dirs from the head node + include_recipe 'aws-parallelcluster-environment::export_internal_use_ebs' + # Mount intel on compute and login nodes + include_recipe 'aws-parallelcluster-environment::mount_intel_dir' +end + include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::fs_mount" +include_recipe "aws-parallelcluster-environment::efs" + +# Mount FSx directory with manage_fsx resource +lustre "mount fsx" do + fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') + 
fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') + fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') + fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') + fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') + fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') + action :mount + not_if { node['cluster']['fsx_fs_ids'].split(',').empty? } +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb new file mode 100644 index 000000000..a967ecece --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb @@ -0,0 +1,41 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +cx_shared_dir_array = [] +cx_efs_fs_id_array = [] +cx_efs_encryption_array = [] +cx_efs_iam_array = [] + +# Identify the customer use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next if node['cluster']['internal_shared_dirs'].include?(dir) + cx_shared_dir_array.push(dir) + cx_efs_fs_id_array.push(efs_fs_id_array[index]) + cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + cx_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount efs" do + shared_dir_array cx_shared_dir_array + efs_fs_id_array cx_efs_fs_id_array + efs_encryption_in_transit_array cx_efs_encryption_array + efs_iam_authorization_array cx_efs_iam_array + action :mount + not_if { cx_shared_dir_array.empty? } +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb new file mode 100644 index 000000000..aa1e1e9db --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +case node['cluster']['node_type'] +when 'HeadNode' + volume "export /home" do + shared_dir "/home" + action :export + end +when 'ComputeFleet', 'LoginNode' + Chef::Log.info("Export only from the HeadNode") +else + raise "node_type must be HeadNode, ComputeFleet, or LoginNode" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_internal_use_ebs.rb similarity index 71% rename from cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/export_internal_use_ebs.rb index 56f34e204..c572f5269 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_internal_use_ebs.rb @@ -16,11 +16,6 @@ case node['cluster']['node_type'] when 'HeadNode' - volume "export /home" do - shared_dir "/home" - action :export - end - # Export /opt/parallelcluster/shared volume "export #{node['cluster']['shared_dir']}" do shared_dir node['cluster']['shared_dir'] @@ -41,19 +36,7 @@ end when 'ComputeFleet', 'LoginNode' - # Mount /opt/intel over NFS only if it exists - exported_intel_dir = format_directory('/opt/intel') - volume "mount /opt/intel" do - action :mount - shared_dir '/opt/intel' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - only_if { ::File.directory?("/opt/intel") } - end - + Chef::Log.info("Export only from the HeadNode") else raise "node_type must be HeadNode or ComputeFleet" end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb 
deleted file mode 100644 index 255be867b..000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb +++ /dev/null @@ -1,34 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -# Mount EFS directory with efs resource -efs "mount efs" do - shared_dir_array node['cluster']['efs_shared_dirs'].split(',') - efs_fs_id_array node['cluster']['efs_fs_ids'].split(',') - efs_encryption_in_transit_array node['cluster']['efs_encryption_in_transits'].split(',') - efs_iam_authorization_array node['cluster']['efs_iam_authorizations'].split(',') - action :mount - not_if { node['cluster']['efs_shared_dirs'].split(',').empty? } -end - -# Mount FSx directory with manage_fsx resource -lustre "mount fsx" do - fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') - fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') - fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') - fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') - fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') - fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') - action :mount - not_if { node['cluster']['fsx_fs_ids'].split(',').empty? 
} -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_intel_dir.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_intel_dir.rb new file mode 100644 index 000000000..16ec9c704 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_intel_dir.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +case node['cluster']['node_type'] +when 'HeadNode' + Chef::Log.info("Mount only on the ComputeFleet and LoginNodes") +when 'ComputeFleet', 'LoginNode' + # Mount /opt/intel over NFS only if it exists + exported_intel_dir = format_directory('/opt/intel') + volume "mount /opt/intel" do + action :mount + shared_dir '/opt/intel' + device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) + fstype 'nfs' + options node['cluster']['nfs']['hard_mount_options'] + retries 10 + retry_delay 6 + only_if { ::File.directory?("/opt/intel") } + end + +else + raise "node_type must be HeadNode or ComputeFleet" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb diff --git 
a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index cd3c801c9..c5ff6a66f 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -13,12 +13,25 @@ # include_recipe "aws-parallelcluster-environment::cfnconfig_mixed" -include_recipe "aws-parallelcluster-environment::mount_shared" cloudwatch "Configure CloudWatch" do action :configure end + +case node['cluster']['internal_shared_storage_type'] +when 'efs' + include_recipe "aws-parallelcluster-environment::mount_internal_use_efs" +when 'ebs' + include_recipe "aws-parallelcluster-environment::mount_internal_use_ebs" +else + raise "internal_shared_storage_type must be ebs or efs" +end + +include_recipe "aws-parallelcluster-environment::mount_home" if %w(ComputeFleet LoginNode).include? node['cluster']['node_type'] + include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' + +# login nodes keys and directory service require shared storage include_recipe "aws-parallelcluster-environment::login_nodes_keys" include_recipe "aws-parallelcluster-environment::directory_service" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb new file mode 100644 index 000000000..6e472f7d1 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. 
A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, back up the data to a temp location + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Backup #{dir}" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp#{dir} + rsync -a #{dir}/ /tmp#{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_ebs.rb similarity index 86% rename from cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb rename to cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_ebs.rb index d6632d4ef..04db5f3b3 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_ebs.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the # License. A copy of the License is located at @@ -11,12 +11,13 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+# This recipe mounts the NFS exports from the head node on compute nodes and login nodes +# when a customer has chosen to use ebs as the internal shared storage type + return if on_docker? case node['cluster']['node_type'] when 'ComputeFleet' - include_recipe 'aws-parallelcluster-environment::mount_home' - # Mount /opt/parallelcluster/shared over NFS volume "mount #{node['cluster']['shared_dir_compute']}" do action :mount @@ -29,8 +30,6 @@ end when 'LoginNode' - include_recipe 'aws-parallelcluster-environment::mount_home' - # Mount /opt/parallelcluster/shared_login_nodes over NFS volume "mount #{node['cluster']['shared_dir_login']}" do action :mount diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_efs.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_efs.rb new file mode 100644 index 000000000..e0ff347f2 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_efs.rb @@ -0,0 +1,96 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +include_recipe "aws-parallelcluster-environment::update_fs_mapping" +include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data" + +efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +initial_shared_dir_array = [] +initial_efs_fs_id_array = [] +initial_efs_encryption_array = [] +initial_efs_iam_array = [] + +# Identify the initial filesystem and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next unless dir == node['cluster']['internal_initial_shared_dir'] + initial_shared_dir_array.push(dir) + initial_efs_fs_id_array.push(efs_fs_id_array[index]) + initial_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + initial_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +if node['cluster']['node_type'] == 'HeadNode' + # Mount the initial internal use EFS + efs "mount initial internal use efs" do + shared_dir_array initial_shared_dir_array + efs_fs_id_array initial_efs_fs_id_array + efs_encryption_in_transit_array initial_efs_encryption_array + efs_iam_authorization_array initial_efs_iam_array + action :mount + not_if { initial_shared_dir_array.empty? } + end + + # Add the mount points for shared dirs + node['cluster']['internal_shared_dirs'].each do |dir| + directory "#{node['cluster']['internal_initial_shared_dir']}#{dir}" do + user 'root' + group 'root' + mode '0755' + action :create + recursive true + end + end unless initial_shared_dir_array.empty? 
+ + # Unmount the root of the EFS after creating the shared directories + # TODO: this doesn't seem to unmount the EFS + efs "unmount internal efs" do + shared_dir_array(lazy { initial_shared_dir_array }) + efs_fs_id_array(lazy { initial_efs_fs_id_array }) + action :unmount + not_if { initial_shared_dir_array.empty? } + end +end + +# Mount the shared dirs; only one initial shared dir is expected, so its first entry is reused for every mount +internal_shared_dir_array = [] +internal_efs_fs_id_array = [] +internal_efs_encryption_array = [] +internal_efs_iam_array = [] +internal_efs_mount_point_array = [] +node['cluster']['internal_shared_dirs'].each do |dir| + # Don't mount the login nodes shared dir to compute nodes + next if node['cluster']['node_type'] == 'ComputeFleet' && dir == node['cluster']['shared_dir_login_nodes'] + internal_shared_dir_array.push(dir) + internal_efs_fs_id_array.push(initial_efs_fs_id_array[0]) + internal_efs_encryption_array.push(initial_efs_encryption_array[0]) + internal_efs_iam_array.push(initial_efs_iam_array[0]) + internal_efs_mount_point_array.push(dir) +end unless initial_shared_dir_array.empty? + +efs "mount internal shared efs" do + shared_dir_array internal_shared_dir_array + efs_fs_id_array internal_efs_fs_id_array + efs_encryption_in_transit_array internal_efs_encryption_array + efs_iam_authorization_array internal_efs_iam_array + efs_mount_point_array internal_efs_mount_point_array + action :mount + not_if { internal_shared_dir_array.empty? 
} +end + +include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb new file mode 100644 index 000000000..83b0d71ec --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, restore the backed-up files that don't already exist there + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage and backed up to a temporary location previously + # Remove the backup after the copy is done + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Restore #{dir}" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp#{dir}/ #{dir} + rm -rf /tmp#{dir}/ + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb index 1aa65acb7..2f5616414 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb @@ -13,10 +13,14 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. +# This is the local system directory at which we want to mount the EFS mount point property :shared_dir_array, Array, required: %i(mount unmount) property :efs_fs_id_array, Array, required: %i(mount unmount) property :efs_encryption_in_transit_array, Array, required: false property :efs_iam_authorization_array, Array, required: false +# This is the mount point on the EFS itself, as opposed to the local system directory; defaults to "/" +property :efs_mount_point_array, Array, required: false +property :efs_unmount_forced_array, Array, required: false action :mount do return if on_docker? 
@@ -24,6 +28,7 @@ efs_fs_id_array = new_resource.efs_fs_id_array.dup efs_encryption_in_transit_array = new_resource.efs_encryption_in_transit_array.dup efs_iam_authorization_array = new_resource.efs_iam_authorization_array.dup + efs_mount_point_array = new_resource.efs_mount_point_array.dup efs_fs_id_array.each_with_index do |efs_fs_id, index| efs_shared_dir = efs_shared_dir_array[index] @@ -41,6 +46,7 @@ mount_options += ",iam" end end + mount_point = efs_mount_point_array.nil? ? "/" : efs_mount_point_array[index] # Create the EFS shared directory directory efs_shared_dir do @@ -49,11 +55,11 @@ mode '1777' recursive true action :create - end + end unless ::File.directory?(efs_shared_dir) # Mount EFS over NFS mount efs_shared_dir do - device "#{efs_fs_id}:/" + device "#{efs_fs_id}:#{mount_point}" fstype 'efs' options mount_options dump 0 @@ -64,8 +70,9 @@ not_if "mount | grep ' #{efs_shared_dir} '" end + # Enable the mount dir mount efs_shared_dir do - device "#{efs_fs_id}.efs.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}:/" + device "#{efs_fs_id}.efs.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}:#{mount_point}" fstype 'efs' options mount_options dump 0 diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb similarity index 69% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb index 6fca809ad..9849bc533 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/efs_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::fs_mount' do +describe 'aws-parallelcluster-environment::efs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ 
-13,12 +13,9 @@ end cached(:node) { chef_run.node } - describe 'call the efs for mounting' do + describe 'call efs for mounting' do it { is_expected.to mount_efs('mount efs') } end - describe 'call the lustre for mounting' do - it { is_expected.to mount_lustre("mount fsx") } - end end end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb new file mode 100644 index 000000000..0198e3261 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb @@ -0,0 +1,22 @@ +require 'spec_helper' + +describe 'aws-parallelcluster-environment::mount_internal_use_efs' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['head_node_private_ip'] = '0.0.0.0' + node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel) + node.override['cluster']['efs_shared_dirs'] = "/opt/parallelcluster/init_shared" + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + describe 'call efs for mounting' do + it { is_expected.to mount_efs('mount internal shared efs') } + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb deleted file mode 100644 index ad2b45e3e..000000000 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb +++ /dev/null @@ -1,34 +0,0 @@ -require 'spec_helper' - -describe 'aws-parallelcluster-environment::mount_shared' do - for_all_oses do |platform, version| - context "on #{platform}#{version}" do - cached(:chef_run) do - runner = runner(platform: platform, 
version: version) do |node| - node.override['cluster']['head_node_private_ip'] = '0.0.0.0' - node.override['cluster']['node_type'] = 'ComputeFleet' - end - runner.converge(described_recipe) - end - cached(:node) { chef_run.node } - - it 'mounts /home' do - is_expected.to mount_volume('mount /home') - .with(device: "0.0.0.0:/home") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) - end - - it 'mounts /opt/parallelcluster/shared' do - is_expected.to mount_volume('mount /opt/parallelcluster/shared') - .with(device: "0.0.0.0:/opt/parallelcluster/shared") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) - end - end - end -end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_intel_dir_spec.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_intel_dir_spec.rb diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_efs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_efs_spec.rb index fe3704818..e3861a20e 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_efs_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
control 'mount_home' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the home directory is mounted' only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } @@ -22,7 +22,7 @@ end control 'mount_shared_compute' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.compute_node? } @@ -34,7 +34,7 @@ end control 'mount_shared_login' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.login_node? } @@ -44,3 +44,19 @@ its('options') { should include 'rw' } end end + +control 'shared_storages_compute_and_login' do + title 'Check the shared storages configuration for compute and login nodes' + + only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } + + describe 'Check that /opt/intel dir has been mounted' + describe mount("/opt/intel") do + it { should be_mounted } + its('device') { should eq "127.0.0.1:/opt/intel" } + its('type') { should eq 'nfs4' } + its('options') { should include 'hard' } + its('options') { should include '_netdev' } + its('options') { should include 'noatime' } + end +end diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb index 042bbcb71..1003af0b4 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb @@ -5,7 +5,6 @@ default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' default['cluster']['slurm']['sha256'] = '4fee743a34514d8fe487080048256f5ee032374ed5f42d0eae342110dcd59edf' -default['cluster']['slurm']['install_dir'] = '/opt/slurm' default['cluster']['dns_domain'] = nil
default['cluster']['use_private_hostname'] = 'false' diff --git a/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml b/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml index 300cdaa0c..f322713ff 100644 --- a/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml +++ b/cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml @@ -176,6 +176,7 @@ suites: cluster: node_type: ComputeFleet head_node_private_ip: '127.0.0.1' + internal_shared_storage_type: ebs - name: config_head_node_munge run_list: - recipe[aws-parallelcluster-tests::setup] diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb index 322ab975f..b0914b518 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb @@ -22,6 +22,7 @@ network get_vpc_cidr_list writeable true options ['no_root_squash'] + only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' } end unless on_docker? 
# Ensure config directory is in place diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb index 574c7a100..ceaa83a58 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/mount_slurm_dir.rb @@ -20,4 +20,5 @@ action %i(mount enable) retries 10 retry_delay 6 + only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' } end diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 9272d2e6b..9e1fc78f4 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -214,7 +214,7 @@ platforms: image_id: <%= ENV['KITCHEN_UBUNTU2204_AMI'] %> <% else %> image_search: - name: ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106 + name: <% if ENV['KITCHEN_PHASE']=='install' %>ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106<% else %><%= pcluster_prefix %>-ubuntu-2204-lts-hvm-*<% end %> architecture: <%= ENV['KITCHEN_ARCHITECTURE'] %> <% end %> block_device_mappings: diff --git a/kitchen.validate-config.yml b/kitchen.validate-config.yml index 265e1795b..b8f284bec 100644 --- a/kitchen.validate-config.yml +++ b/kitchen.validate-config.yml @@ -27,6 +27,7 @@ _common_cluster_attributes: &_common_cluster_attributes enable_efa: 'efa' nvidia: enabled: <%= ENV['NVIDIA_ENABLED'] %> + internal_shared_storage_type: ebs _head_node_cluster_attributes: &_head_node_cluster_attributes << : *_common_cluster_attributes