diff --git a/CHANGELOG.md b/CHANGELOG.md index 63938c8ace..42b9f2b7a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install [Spack](https://spack.io) by default in cluster user's home directory. **CHANGES** +- Migrate NFS exports from the head node root volume and associated data for intra-cluster shared storage to external AWS EFS filesystems attached to the cluster stack. **BUG FIXES** - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources. diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index edd70bcf99..538eeaf37f 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -24,5 +24,6 @@ # Fetch config must be executed after the mount of the shared folders because the config will be saved there fetch_config 'Fetch and load cluster configs' +# Compute fleet init requires shared dirs include_recipe "aws-parallelcluster-computefleet::init" include_recipe "aws-parallelcluster-slurm::init" diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 116e034817..e10f157445 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -16,8 +16,8 @@ update true end -# generate the update shared storages mapping file -include_recipe 'aws-parallelcluster-environment::fs_update' +# generate the updated shared storages mapping file +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' include_recipe 'aws-parallelcluster-environment::directory_service' include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm' diff --git 
a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 1a6bd3e1a1..022d7042f9 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -53,5 +53,7 @@ default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_head'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes'] - +default['cluster']['internal_shared_dirs'] = [] +default['cluster']['internal_shared_dirs'].append(node['cluster']['shared_dir_login_nodes']) if %w[LoginNode HeadNode].include?(node['cluster']['node_type']) +default['cluster']['internal_shared_dirs'].append(node['cluster']['shared_dir'], "/opt/slurm", "/opt/intel") if %w[ComputeFleet HeadNode].include?(node['cluster']['node_type']) default['cluster']['head_node_private_ip'] = nil diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 86d346eb15..d3f4bc4b6a 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -414,10 +414,10 @@ suites: fsx_shared_dirs: '' raid_shared_dir: '' ephemeral_dir: test1 - - name: fs_update + - name: update_fs_mapping run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_created_correctly @@ -440,10 +440,10 @@ suites: fsx_dns_names: dns1,dns2 fsx_mount_names: mount1,mount2 fsx_volume_junction_paths: value1,value2 - - name: fs_update_default_values + - name: update_fs_mapping_default_values run_list: - recipe[aws-parallelcluster-tests::setup] - - 
recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_with_default_values @@ -472,42 +472,43 @@ suites: scheduler: slurm head_node_imds_secured: 'true' head_node_imds_allowed_users: ['root', 'nobody'] - - name: mount_shared_compute - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_compute - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'ComputeFleet' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' - - name: mount_shared_login - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_login - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'LoginNode' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' +# TODO replacement for the mount_internal_use_fs recipe since it uses shared storage +# - name: mount_shared_compute +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_compute +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'ComputeFleet' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: 
'/fake_headnode_shared' +# - name: mount_shared_login +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_login +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'LoginNode' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' - name: raid_compute run_list: - recipe[aws-parallelcluster-tests::setup] @@ -541,7 +542,7 @@ suites: - name: shared_storages_compute run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::mount_cx_fs] verifier: controls: - shared_storages_compute_and_login @@ -555,7 +556,7 @@ suites: - name: shared_storages_login run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::mount_cx_fs] verifier: controls: - shared_storages_compute_and_login diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml new file mode 100644 index 0000000000..19fc143239 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml @@ -0,0 +1,88 @@ +# Validates config recipes +--- +verifier: + name: inspec + inspec_tests: + - cookbooks/aws-parallelcluster-environment/test + +_common_cluster_attributes: &_common_cluster_attributes + stack_name: <%= ENV['AWS_STACK_NAME'] || 'fake_stack' %> + volume: <%= ENV['VOLUME'] || "''" %> + region: <%= ENV['KITCHEN_AWS_REGION'] %> + ephemeral_dir: <%= ENV['EPHEMERAL_DIR'] || '/scratch' %> + ebs_shared_dirs: <%= ENV['EBS_SHARED_DIRS'] || '/shared' %> + 
cluster_s3_bucket: <%= ENV['CLUSTER_CONFIG_S3_BUCKET'] %> + cluster_config_s3_key: <%= ENV['CLUSTER_CONFIG_S3_KEY'] %> + instance_types_data_s3_key: <%= ENV['INSTANCE_TYPES_DATA_S3_KEY'] %> + os: <%= ENV['OS'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + dcv_enabled: 'head_node' + dcv_port: '8443' + enable_efa: 'efa' + nvidia: + enabled: <%= ENV['NVIDIA_ENABLED'] %> + +_head_node_cluster_attributes: &_head_node_cluster_attributes + << : *_common_cluster_attributes + node_type: 'HeadNode' + ddb_table: <%= ENV['DDB_TABLE'] %> + slurm_ddb_table: <%= ENV['DDB_TABLE'] %> + +_compute_node_cluster_attributes: &_compute_node_cluster_attributes + << : *_common_cluster_attributes + node_type: 'ComputeFleet' + head_node: <%= ENV['HEAD_NODE'] %> + head_node_private_ip: <%= ENV['HEAD_NODE_PRIVATE_IP'] %> + +_run_list: &_run_list + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-entrypoints::init] + - recipe[aws-parallelcluster-entrypoints::config] + - recipe[aws-parallelcluster-entrypoints::finalize] + - recipe[aws-parallelcluster-tests::tear_down] + +provisioner: + attributes: + kitchen: true + +suites: + - name: init-head + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::init] + - recipe[aws-parallelcluster-tests::tear_down] + verifier: + controls: + - /tag:config_cfnconfig_file_configuration/ + - /mount_home/ + - /mount_shared/ + + attributes: &attributes_slurm_config_HeadNode + cluster: + << : *_head_node_cluster_attributes + scheduler: 'slurm' + enable_intel_hpc_platform: "<%= ENV['ENABLE_INTEL_HPC_PLATFORM'] || false %>" + - name: init-compute + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::init] + - recipe[aws-parallelcluster-tests::tear_down] + verifier: + controls: + - /tag:config_cfnconfig_file_configuration/ + - /mount_home/ + - /mount_shared/ + + attributes: &attributes_slurm_config_HeadNode + cluster: + << : 
*_compute_node_cluster_attributes + scheduler: 'slurm' + slurm_nodename: 'fake-dy-compute-1' + + - name: slurm-config-compute-fleet-x86-64-<%= ENV['KITCHEN_INSTANCE_TYPE'] || 'c5n.xlarge' %> + run_list: *_run_list + attributes: &attributes_slurm_config_ComputeFleet + cluster: + << : *_compute_node_cluster_attributes + scheduler: 'slurm' + slurm_nodename: 'fake-dy-compute-1' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 6eefa2b369..714ab70e2c 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,9 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# fs_update generates the shared storages mapping file so must be executed before shared storages recipes -include_recipe 'aws-parallelcluster-environment::fs_update' -include_recipe 'aws-parallelcluster-environment::shared_storages' +# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' +include_recipe 'aws-parallelcluster-environment::export_home' include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::fs_mount" +include_recipe "aws-parallelcluster-environment::mount_cx_fs" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb new file mode 100644 index 0000000000..aa1e1e9dbc --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +case node['cluster']['node_type'] +when 'HeadNode' + volume "export /home" do + shared_dir "/home" + action :export + end +when 'ComputeFleet', 'LoginNode' + Chef::Log.info("Export only from the HeadNode") +else + raise "node_type must be HeadNode, ComputeFleet, or LoginNode" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb similarity index 50% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb index 255be867b4..1dbf03ba94 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the # License. A copy of the License is located at @@ -10,15 +10,34 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+return if on_docker? +efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') -# Mount EFS directory with efs resource +cx_shared_dir_array = [] +cx_efs_fs_id_array = [] +cx_efs_encryption_array = [] +cx_efs_iam_array = [] + +# Identify the customer use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next if node['cluster']['internal_shared_dirs'].include?(dir) + cx_shared_dir_array.push(dir) + cx_efs_fs_id_array.push(efs_fs_id_array[index]) + cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + cx_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource efs "mount efs" do - shared_dir_array node['cluster']['efs_shared_dirs'].split(',') - efs_fs_id_array node['cluster']['efs_fs_ids'].split(',') - efs_encryption_in_transit_array node['cluster']['efs_encryption_in_transits'].split(',') - efs_iam_authorization_array node['cluster']['efs_iam_authorizations'].split(',') + shared_dir_array cx_shared_dir_array + efs_fs_id_array cx_efs_fs_id_array + efs_encryption_in_transit_array cx_efs_encryption_array + efs_iam_authorization_array cx_efs_iam_array action :mount - not_if { node['cluster']['efs_shared_dirs'].split(',').empty? } + not_if { cx_shared_dir_array.empty? 
} end # Mount FSx directory with manage_fsx resource diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb deleted file mode 100644 index d6632d4eff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb +++ /dev/null @@ -1,48 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? 
- -case node['cluster']['node_type'] -when 'ComputeFleet' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared over NFS - volume "mount #{node['cluster']['shared_dir_compute']}" do - action :mount - shared_dir node['cluster']['shared_dir_compute'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_head']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end - -when 'LoginNode' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared_login_nodes over NFS - volume "mount #{node['cluster']['shared_dir_login']}" do - action :mount - shared_dir node['cluster']['shared_dir_login'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_login']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end -when 'HeadNode' - Chef::Log.info("Nothing to mount in the HeadNode") -else - raise "node_type must be HeadNode, LoginNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb deleted file mode 100644 index 56f34e2047..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb +++ /dev/null @@ -1,59 +0,0 @@ -# frozen_string_literal: true - -# -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? - -case node['cluster']['node_type'] -when 'HeadNode' - volume "export /home" do - shared_dir "/home" - action :export - end - - # Export /opt/parallelcluster/shared - volume "export #{node['cluster']['shared_dir']}" do - shared_dir node['cluster']['shared_dir'] - action :export - end - - # Export /opt/parallelcluster/shared_login_nodes - volume "export #{node['cluster']['shared_dir_login_nodes']}" do - shared_dir node['cluster']['shared_dir_login_nodes'] - action :export - end - - # Export /opt/intel only if exists - volume "export /opt/intel" do - shared_dir "/opt/intel" - only_if { ::File.directory?("/opt/intel") } - action :export - end - -when 'ComputeFleet', 'LoginNode' - # Mount /opt/intel over NFS only if it exists - exported_intel_dir = format_directory('/opt/intel') - volume "mount /opt/intel" do - action :mount - shared_dir '/opt/intel' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - only_if { ::File.directory?("/opt/intel") } - end - -else - raise "node_type must be HeadNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb similarity index 100% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index cd3c801c9c..8e465ee0ea 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -13,12 +13,18 @@ # include_recipe 
"aws-parallelcluster-environment::cfnconfig_mixed" -include_recipe "aws-parallelcluster-environment::mount_shared" cloudwatch "Configure CloudWatch" do action :configure end +include_recipe "aws-parallelcluster-environment::update_fs_mapping" +include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data" +include_recipe "aws-parallelcluster-environment::mount_internal_use_fs" +include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data" + include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' + +# login nodes keys and directory service require shared storage include_recipe "aws-parallelcluster-environment::login_nodes_keys" include_recipe "aws-parallelcluster-environment::directory_service" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb new file mode 100644 index 0000000000..0be3bacfee --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each, backup the data to a temp location + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Backup #{dir}" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp#{dir} + rsync -a #{dir}/ /tmp#{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb new file mode 100644 index 0000000000..b153e1bb50 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +internal_shared_dir_array = [] +internal_efs_fs_id_array = [] +internal_efs_encryption_array = [] +internal_efs_iam_array = [] + +# Identify the internal use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next unless node['cluster']['internal_shared_dirs'].include?(dir) + internal_shared_dir_array.push(dir) + internal_efs_fs_id_array.push(efs_fs_id_array[index]) + internal_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + internal_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount internal use efs" do + shared_dir_array internal_shared_dir_array + efs_fs_id_array internal_efs_fs_id_array + efs_encryption_in_transit_array internal_efs_encryption_array + efs_iam_authorization_array internal_efs_iam_array + action :mount + not_if { internal_shared_dir_array.empty? 
} +end + +# TODO: replace home as NFS with shared /home +case node['cluster']['node_type'] +when 'ComputeFleet', 'LoginNode' + include_recipe 'aws-parallelcluster-environment::mount_home' +when 'HeadNode' + Chef::Log.info("Nothing to mount in the HeadNode") +else + raise "node_type must be HeadNode, LoginNode or ComputeFleet" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb new file mode 100644 index 0000000000..6b8da840fd --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each, restore to the shared storage if it doesn't already exist + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Restore #{dir}" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp#{dir}/ #{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb similarity index 78% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb index 6fca809adf..5bfb446069 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::fs_mount' do +describe 'aws-parallelcluster-environment::mount_cx_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ -13,10 +13,10 @@ end cached(:node) { chef_run.node } - describe 'call the efs for mounting' do + describe 'call efs for mounting' do it { is_expected.to mount_efs('mount efs') } end - describe 'call the lustre for mounting' do + describe 'call lustre for mounting' do it { is_expected.to mount_lustre("mount fsx") } end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb index ad2b45e3ea..c5e33aab6d 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb +++ 
b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb @@ -1,12 +1,14 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::mount_shared' do +describe 'aws-parallelcluster-environment::mount_internal_use_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| node.override['cluster']['head_node_private_ip'] = '0.0.0.0' node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel) + node.override['cluster']['efs_shared_dirs'] = "/opt/slurm,/opt/intel" end runner.converge(described_recipe) end @@ -21,13 +23,8 @@ .with(retry_delay: 6) end - it 'mounts /opt/parallelcluster/shared' do - is_expected.to mount_volume('mount /opt/parallelcluster/shared') - .with(device: "0.0.0.0:/opt/parallelcluster/shared") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) + describe 'call efs for mounting' do + it { is_expected.to mount_efs('mount internal use efs') } end end end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb index fe37048180..e3861a20e6 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
control 'mount_home' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the home directory is mounted' only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } @@ -22,7 +22,7 @@ end control 'mount_shared_compute' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.compute_node? } @@ -34,7 +34,7 @@ end control 'mount_shared_login' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.login_node? } @@ -44,3 +44,19 @@ its('options') { should include 'rw' } end end + +control 'shared_storages_compute_and_login' do + title 'Check the shared storages configuration for compute node' + + only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } + + describe 'Check that /opt/intel dir has been mounted' + describe mount("/opt/intel") do + it { should be_mounted } + its('device') { should eq "127.0.0.1:/opt/intel" } + its('type') { should eq 'nfs4' } + its('options') { should include 'hard' } + its('options') { should include '_netdev' } + its('options') { should include 'noatime' } + end +end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb deleted file mode 100644 index 6d9fe989ff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. 
A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -control 'shared_storages_compute_and_login' do - title 'Check the shared storages configuration for compute node' - - only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } - - describe 'Check that /opt/intel dir has been mounted' - describe mount("/opt/intel") do - it { should be_mounted } - its('device') { should eq "127.0.0.1:/opt/intel" } - its('type') { should eq 'nfs4' } - its('options') { should include 'hard' } - its('options') { should include '_netdev' } - its('options') { should include 'noatime' } - end -end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb index 29a6fba001..9424e491c0 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb @@ -24,8 +24,6 @@ mode '0700' end -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' - # Check to see if is GPU instance with Nvidia installed Chef::Log.warn("GPU instance but no Nvidia drivers found") if graphic_instance? && !nvidia_installed? 
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb index 74113ead54..9c304d3641 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb @@ -17,13 +17,6 @@ include_recipe 'aws-parallelcluster-slurm::config_munge_key' -# Export /opt/slurm -nfs_export "#{node['cluster']['slurm']['install_dir']}" do - network get_vpc_cidr_list - writeable true - options ['no_root_squash'] -end unless on_docker? - # Ensure config directory is in place directory "#{node['cluster']['slurm']['install_dir']}" do user 'root' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb index ce562c2c49..47df2dd4dd 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb @@ -17,5 +17,3 @@ # TODO: rename, find a better name that include login nodes setup_munge_compute_node - -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8bb9cdcde5..a3fdfee52f 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -187,7 +187,7 @@ platforms: image_id: <%= ENV['KITCHEN_UBUNTU2204_AMI'] %> <% else %> image_search: - name: ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106 + name: <% if ENV['KITCHEN_PHASE']=='install' %>ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106<% else %><%= pcluster_prefix %>-ubuntu-2204-lts-hvm-*<% end %> architecture: <%= ENV['KITCHEN_ARCHITECTURE'] %> <% end %> block_device_mappings: