From 28c41d9892571f2ebd2fec6dc7f33750b7e869ee Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Fri, 15 Sep 2023 12:15:18 -0400 Subject: [PATCH] [Develop] Migrate internal storage to EFS from NFS exports Add backup and restore recipes to move data in second stage images to shared filesystems Create a new mount_internal_use_fs.rb recipe to mount the internal shared filesystems Filter the efs filesystem arrays to mount internal shared fses in init and cx fses in config Refactor environment recipes to be clearer in functional description and remove unnecessary recipes --- CHANGELOG.md | 1 + .../recipes/init.rb | 1 + .../recipes/update.rb | 4 +- .../attributes/environment.rb | 4 +- .../kitchen.environment-config.yml | 85 +++++++++--------- .../kitchen.environment-recipes.yml | 88 +++++++++++++++++++ .../recipes/config.rb | 8 +- .../recipes/config/export_home.rb | 27 ++++++ .../config/{fs_mount.rb => mount_cx_fs.rb} | 33 +++++-- .../recipes/config/mount_shared.rb | 48 ---------- .../recipes/config/shared_storages.rb | 59 ------------- .../{fs_update.rb => update_fs_mapping.rb} | 0 .../recipes/init.rb | 8 +- .../init/backup_internal_use_shared_data.rb | 29 ++++++ .../recipes/init/mount_internal_use_fs.rb | 51 +++++++++++ .../init/restore_internal_use_shared_data.rb | 28 ++++++ .../{fs_mount_spec.rb => mount_cx_fs_spec.rb} | 6 +- ..._spec.rb => mount_internal_use_fs_spec.rb} | 13 ++- ..._spec.rb => mount_internal_use_fs_spec.rb} | 22 ++++- .../test/controls/shared_storages_spec.rb | 26 ------ .../recipes/config/config_compute.rb | 2 - .../recipes/config/config_head_node.rb | 7 -- .../recipes/config/config_login.rb | 2 - kitchen.ec2.yml | 2 +- 24 files changed, 338 insertions(+), 216 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb rename cookbooks/aws-parallelcluster-environment/recipes/config/{fs_mount.rb => mount_cx_fs.rb} 
(50%) delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb delete mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb rename cookbooks/aws-parallelcluster-environment/recipes/config/{fs_update.rb => update_fs_mapping.rb} (100%) create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb rename cookbooks/aws-parallelcluster-environment/spec/unit/recipes/{fs_mount_spec.rb => mount_cx_fs_spec.rb} (78%) rename cookbooks/aws-parallelcluster-environment/spec/unit/recipes/{mount_shared_spec.rb => mount_internal_use_fs_spec.rb} (65%) rename cookbooks/aws-parallelcluster-environment/test/controls/{mount_shared_spec.rb => mount_internal_use_fs_spec.rb} (65%) delete mode 100644 cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 63938c8ace..42b9f2b7a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install [Spack](https://spack.io) by default in cluster user's home directory. **CHANGES** +- Migrate NFS exports from the head node root volume and associated data for intra-cluster shared storage to external AWS EFS filesystems attached to the cluster stack. **BUG FIXES** - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources. 
diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index edd70bcf99..538eeaf37f 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -24,5 +24,6 @@ # Fetch config must be executed after the mount of the shared folders because the config will be saved there fetch_config 'Fetch and load cluster configs' +# Compute fleet init requires shared dirs include_recipe "aws-parallelcluster-computefleet::init" include_recipe "aws-parallelcluster-slurm::init" diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 116e034817..e10f157445 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -16,8 +16,8 @@ update true end -# generate the update shared storages mapping file -include_recipe 'aws-parallelcluster-environment::fs_update' +# generate the updated shared storages mapping file +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' include_recipe 'aws-parallelcluster-environment::directory_service' include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm' diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 1a6bd3e1a1..022d7042f9 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -53,5 +53,7 @@ default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_head'] = node['cluster']['shared_dir'] default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes'] - +default['cluster']['internal_shared_dirs'] = [] 
+default['cluster']['internal_shared_dirs'].append(node['cluster']['shared_dir_login_nodes']) if %w[LoginNode HeadNode].include?(node['cluster']['node_type']) +default['cluster']['internal_shared_dirs'].append(node['cluster']['shared_dir'], "/opt/slurm", "/opt/intel") if %w[ComputeFleet HeadNode].include?(node['cluster']['node_type']) default['cluster']['head_node_private_ip'] = nil diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml index 86d346eb15..d3f4bc4b6a 100644 --- a/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-config.yml @@ -414,10 +414,10 @@ suites: fsx_shared_dirs: '' raid_shared_dir: '' ephemeral_dir: test1 - - name: fs_update + - name: update_fs_mapping run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_created_correctly @@ -440,10 +440,10 @@ suites: fsx_dns_names: dns1,dns2 fsx_mount_names: mount1,mount2 fsx_volume_junction_paths: value1,value2 - - name: fs_update_default_values + - name: update_fs_mapping_default_values run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::fs_update] + - recipe[aws-parallelcluster-environment::update_fs_mapping] verifier: controls: - fs_data_file_with_default_values @@ -472,42 +472,43 @@ suites: scheduler: slurm head_node_imds_secured: 'true' head_node_imds_allowed_users: ['root', 'nobody'] - - name: mount_shared_compute - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_compute - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - 
recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'ComputeFleet' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' - - name: mount_shared_login - run_list: - - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::mount_shared] - verifier: - controls: - - mount_home - - mount_shared_login - attributes: - dependencies: - - recipe:aws-parallelcluster-platform::directories - - resource:nfs - - recipe:aws-parallelcluster-environment::mock_export_directories - cluster: - node_type: 'LoginNode' - head_node_private_ip: '127.0.0.1' - head_node_home_path: '/fake_headnode_home' - shared_dir_head: '/fake_headnode_shared' +# TODO replacement for the mount_internal_use_fs recipe since it uses shared storage +# - name: mount_shared_compute +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_compute +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'ComputeFleet' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' +# - name: mount_shared_login +# run_list: +# - recipe[aws-parallelcluster-tests::setup] +# - recipe[aws-parallelcluster-environment::mount_shared] +# verifier: +# controls: +# - mount_home +# - mount_shared_login +# attributes: +# dependencies: +# - recipe:aws-parallelcluster-platform::directories +# - resource:nfs +# - recipe:aws-parallelcluster-environment::mock_export_directories +# cluster: +# node_type: 'LoginNode' +# head_node_private_ip: '127.0.0.1' +# head_node_home_path: '/fake_headnode_home' +# shared_dir_head: '/fake_headnode_shared' - name: raid_compute run_list: - 
recipe[aws-parallelcluster-tests::setup] @@ -541,7 +542,7 @@ suites: - name: shared_storages_compute run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::mount_cx_fs] verifier: controls: - shared_storages_compute_and_login @@ -555,7 +556,7 @@ suites: - name: shared_storages_login run_list: - recipe[aws-parallelcluster-tests::setup] - - recipe[aws-parallelcluster-environment::shared_storages] + - recipe[aws-parallelcluster-environment::mount_cx_fs] verifier: controls: - shared_storages_compute_and_login diff --git a/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml b/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml new file mode 100644 index 0000000000..19fc143239 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/kitchen.environment-recipes.yml @@ -0,0 +1,88 @@ +# Validates config recipes +--- +verifier: + name: inspec + inspec_tests: + - cookbooks/aws-parallelcluster-environment/test + +_common_cluster_attributes: &_common_cluster_attributes + stack_name: <%= ENV['AWS_STACK_NAME'] || 'fake_stack' %> + volume: <%= ENV['VOLUME'] || "''" %> + region: <%= ENV['KITCHEN_AWS_REGION'] %> + ephemeral_dir: <%= ENV['EPHEMERAL_DIR'] || '/scratch' %> + ebs_shared_dirs: <%= ENV['EBS_SHARED_DIRS'] || '/shared' %> + cluster_s3_bucket: <%= ENV['CLUSTER_CONFIG_S3_BUCKET'] %> + cluster_config_s3_key: <%= ENV['CLUSTER_CONFIG_S3_KEY'] %> + instance_types_data_s3_key: <%= ENV['INSTANCE_TYPES_DATA_S3_KEY'] %> + os: <%= ENV['OS'] %> + custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %> + dcv_enabled: 'head_node' + dcv_port: '8443' + enable_efa: 'efa' + nvidia: + enabled: <%= ENV['NVIDIA_ENABLED'] %> + +_head_node_cluster_attributes: &_head_node_cluster_attributes + << : *_common_cluster_attributes + node_type: 'HeadNode' + ddb_table: <%= ENV['DDB_TABLE'] %> + slurm_ddb_table: <%= ENV['DDB_TABLE'] %> + 
+_compute_node_cluster_attributes: &_compute_node_cluster_attributes + << : *_common_cluster_attributes + node_type: 'ComputeFleet' + head_node: <%= ENV['HEAD_NODE'] %> + head_node_private_ip: <%= ENV['HEAD_NODE_PRIVATE_IP'] %> + +_run_list: &_run_list + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-entrypoints::init] + - recipe[aws-parallelcluster-entrypoints::config] + - recipe[aws-parallelcluster-entrypoints::finalize] + - recipe[aws-parallelcluster-tests::tear_down] + +provisioner: + attributes: + kitchen: true + +suites: + - name: init-head + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::init] + - recipe[aws-parallelcluster-tests::tear_down] + verifier: + controls: + - /tag:config_cfnconfig_file_configuration/ + - /mount_home/ + - /mount_shared/ + + attributes: &attributes_slurm_config_HeadNode + cluster: + << : *_head_node_cluster_attributes + scheduler: 'slurm' + enable_intel_hpc_platform: "<%= ENV['ENABLE_INTEL_HPC_PLATFORM'] || false %>" + - name: init-compute + run_list: + - recipe[aws-parallelcluster-tests::setup] + - recipe[aws-parallelcluster-environment::init] + - recipe[aws-parallelcluster-tests::tear_down] + verifier: + controls: + - /tag:config_cfnconfig_file_configuration/ + - /mount_home/ + - /mount_shared/ + + attributes: &attributes_slurm_config_HeadNode + cluster: + << : *_compute_node_cluster_attributes + scheduler: 'slurm' + slurm_nodename: 'fake-dy-compute-1' + + - name: slurm-config-compute-fleet-x86-64-<%= ENV['KITCHEN_INSTANCE_TYPE'] || 'c5n.xlarge' %> + run_list: *_run_list + attributes: &attributes_slurm_config_ComputeFleet + cluster: + << : *_compute_node_cluster_attributes + scheduler: 'slurm' + slurm_nodename: 'fake-dy-compute-1' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index 6eefa2b369..714ab70e2c 100644 --- 
a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,9 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# fs_update generates the shared storages mapping file so must be executed before shared storages recipes -include_recipe 'aws-parallelcluster-environment::fs_update' -include_recipe 'aws-parallelcluster-environment::shared_storages' +# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +include_recipe 'aws-parallelcluster-environment::update_fs_mapping' +include_recipe 'aws-parallelcluster-environment::export_home' include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::fs_mount" +include_recipe "aws-parallelcluster-environment::mount_cx_fs" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb new file mode 100644 index 0000000000..aa1e1e9dbc --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +case node['cluster']['node_type'] +when 'HeadNode' + volume "export /home" do + shared_dir "/home" + action :export + end +when 'ComputeFleet', 'LoginNode' + Chef::Log.info("Export only from the HeadNode") +else + raise "node_type must be HeadNode, ComputeFleet, or LoginNode" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb similarity index 50% rename from cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb index 255be867b4..1dbf03ba94 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_mount.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_cx_fs.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the # License. A copy of the License is located at @@ -10,15 +10,34 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') -# Mount EFS directory with efs resource +cx_shared_dir_array = [] +cx_efs_fs_id_array = [] +cx_efs_encryption_array = [] +cx_efs_iam_array = [] + +# Identify the customer use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next if node['cluster']['internal_shared_dirs'].include?(dir) + cx_shared_dir_array.push(dir) + cx_efs_fs_id_array.push(efs_fs_id_array[index]) + cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + cx_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource efs "mount efs" do - shared_dir_array node['cluster']['efs_shared_dirs'].split(',') - efs_fs_id_array node['cluster']['efs_fs_ids'].split(',') - efs_encryption_in_transit_array node['cluster']['efs_encryption_in_transits'].split(',') - efs_iam_authorization_array node['cluster']['efs_iam_authorizations'].split(',') + shared_dir_array cx_shared_dir_array + efs_fs_id_array cx_efs_fs_id_array + efs_encryption_in_transit_array cx_efs_encryption_array + efs_iam_authorization_array cx_efs_iam_array action :mount - not_if { node['cluster']['efs_shared_dirs'].split(',').empty? } + not_if { cx_shared_dir_array.empty? } end # Mount FSx directory with manage_fsx resource diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb deleted file mode 100644 index d6632d4eff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_shared.rb +++ /dev/null @@ -1,48 +0,0 @@ -# frozen_string_literal: true - -# Copyright:: 2013-2021 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? - -case node['cluster']['node_type'] -when 'ComputeFleet' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared over NFS - volume "mount #{node['cluster']['shared_dir_compute']}" do - action :mount - shared_dir node['cluster']['shared_dir_compute'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_head']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end - -when 'LoginNode' - include_recipe 'aws-parallelcluster-environment::mount_home' - - # Mount /opt/parallelcluster/shared_login_nodes over NFS - volume "mount #{node['cluster']['shared_dir_login']}" do - action :mount - shared_dir node['cluster']['shared_dir_login'] - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['shared_dir_login']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - end -when 'HeadNode' - Chef::Log.info("Nothing to mount in the HeadNode") -else - raise "node_type must be HeadNode, LoginNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb deleted file mode 100644 index 56f34e2047..0000000000 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/shared_storages.rb +++ /dev/null @@ 
-1,59 +0,0 @@ -# frozen_string_literal: true - -# -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the -# License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. - -return if on_docker? - -case node['cluster']['node_type'] -when 'HeadNode' - volume "export /home" do - shared_dir "/home" - action :export - end - - # Export /opt/parallelcluster/shared - volume "export #{node['cluster']['shared_dir']}" do - shared_dir node['cluster']['shared_dir'] - action :export - end - - # Export /opt/parallelcluster/shared_login_nodes - volume "export #{node['cluster']['shared_dir_login_nodes']}" do - shared_dir node['cluster']['shared_dir_login_nodes'] - action :export - end - - # Export /opt/intel only if exists - volume "export /opt/intel" do - shared_dir "/opt/intel" - only_if { ::File.directory?("/opt/intel") } - action :export - end - -when 'ComputeFleet', 'LoginNode' - # Mount /opt/intel over NFS only if it exists - exported_intel_dir = format_directory('/opt/intel') - volume "mount /opt/intel" do - action :mount - shared_dir '/opt/intel' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 - only_if { ::File.directory?("/opt/intel") } - end - -else - raise "node_type must be HeadNode or ComputeFleet" -end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb similarity index 100% rename from 
cookbooks/aws-parallelcluster-environment/recipes/config/fs_update.rb rename to cookbooks/aws-parallelcluster-environment/recipes/config/update_fs_mapping.rb diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index cd3c801c9c..8e465ee0ea 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -13,12 +13,18 @@ # include_recipe "aws-parallelcluster-environment::cfnconfig_mixed" -include_recipe "aws-parallelcluster-environment::mount_shared" cloudwatch "Configure CloudWatch" do action :configure end +include_recipe "aws-parallelcluster-environment::update_fs_mapping" +include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data" +include_recipe "aws-parallelcluster-environment::mount_internal_use_fs" +include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data" + include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' + +# login nodes keys and directory service require shared storage include_recipe "aws-parallelcluster-environment::login_nodes_keys" include_recipe "aws-parallelcluster-environment::directory_service" diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb new file mode 100644 index 0000000000..0be3bacfee --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_internal_use_shared_data.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. 
A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, back up its data to a temp location under /tmp + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Backup #{dir}" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp#{dir} + rsync -a #{dir}/ /tmp#{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb new file mode 100644 index 0000000000..b153e1bb50 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/mount_internal_use_fs.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? 
+efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') +efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') +efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') + +internal_shared_dir_array = [] +internal_efs_fs_id_array = [] +internal_efs_encryption_array = [] +internal_efs_iam_array = [] + +# Identify the internal use filesystems and store their data in arrays for the EFS resource +efs_shared_dir_array.each_with_index do |dir, index| + next unless node['cluster']['internal_shared_dirs'].include?(dir) + internal_shared_dir_array.push(dir) + internal_efs_fs_id_array.push(efs_fs_id_array[index]) + internal_efs_encryption_array.push(efs_encryption_in_transit_array[index]) + internal_efs_iam_array.push(efs_iam_authorization_array[index]) +end + +# Mount EFS directories with the efs resource +efs "mount internal use efs" do + shared_dir_array internal_shared_dir_array + efs_fs_id_array internal_efs_fs_id_array + efs_encryption_in_transit_array internal_efs_encryption_array + efs_iam_authorization_array internal_efs_iam_array + action :mount + not_if { internal_shared_dir_array.empty? 
} +end + +# TODO: replace home as NFS with shared /home +case node['cluster']['node_type'] +when 'ComputeFleet', 'LoginNode' + include_recipe 'aws-parallelcluster-environment::mount_home' +when 'HeadNode' + Chef::Log.info("Nothing to mount in the HeadNode") +else + raise "node_type must be HeadNode, LoginNode or ComputeFleet" +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb new file mode 100644 index 0000000000..6b8da840fd --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each internal shared directory, restore the backed-up data, skipping files that already exist on the shared storage + node['cluster']['internal_shared_dirs'].each do |dir| + bash "Restore #{dir}" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp#{dir}/ #{dir} + EOH + end + end +end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb similarity index 78% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb index 6fca809adf..5bfb446069 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/fs_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_cx_fs_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::fs_mount' do +describe 'aws-parallelcluster-environment::mount_cx_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do @@ -13,10 +13,10 @@ end cached(:node) { chef_run.node } - describe 'call the efs for mounting' do + describe 'call efs for mounting' do it { is_expected.to mount_efs('mount efs') } end - describe 'call the lustre for mounting' do + describe 'call lustre for mounting' do it { is_expected.to mount_lustre("mount fsx") } end end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb index ad2b45e3ea..c5e33aab6d 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_shared_spec.rb +++ 
b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_fs_spec.rb @@ -1,12 +1,14 @@ require 'spec_helper' -describe 'aws-parallelcluster-environment::mount_shared' do +describe 'aws-parallelcluster-environment::mount_internal_use_fs' do for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| node.override['cluster']['head_node_private_ip'] = '0.0.0.0' node.override['cluster']['node_type'] = 'ComputeFleet' + node.override['cluster']['internal_shared_dirs'] = %w(/opt/slurm /opt/intel) + node.override['cluster']['efs_shared_dirs'] = "/opt/slurm,/opt/intel" end runner.converge(described_recipe) end @@ -21,13 +23,8 @@ .with(retry_delay: 6) end - it 'mounts /opt/parallelcluster/shared' do - is_expected.to mount_volume('mount /opt/parallelcluster/shared') - .with(device: "0.0.0.0:/opt/parallelcluster/shared") - .with(fstype: 'nfs') - .with(options: 'hard,_netdev,noatime') - .with(retries: 10) - .with(retry_delay: 6) + describe 'call efs for mounting' do + it { is_expected.to mount_efs('mount internal use efs') } end end end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb similarity index 65% rename from cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb rename to cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb index fe37048180..e3861a20e6 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/mount_shared_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/mount_internal_use_fs_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
control 'mount_home' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the home directory is mounted' only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } @@ -22,7 +22,7 @@ end control 'mount_shared_compute' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.compute_node? } @@ -34,7 +34,7 @@ end control 'mount_shared_login' do - title 'Check if the home and the shared directories are mounted' + title 'Check if the shared directory is mounted' only_if { !os_properties.on_docker? && instance.login_node? } @@ -44,3 +44,19 @@ its('options') { should include 'rw' } end end + +control 'shared_storages_compute_and_login' do + title 'Check the shared storages configuration for compute node' + + only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } + + describe 'Check that /opt/intel dir has been mounted' + describe mount("/opt/intel") do + it { should be_mounted } + its('device') { should eq "127.0.0.1:/opt/intel" } + its('type') { should eq 'nfs4' } + its('options') { should include 'hard' } + its('options') { should include '_netdev' } + its('options') { should include 'noatime' } + end +end diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb deleted file mode 100644 index 6d9fe989ff..0000000000 --- a/cookbooks/aws-parallelcluster-environment/test/controls/shared_storages_spec.rb +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. 
A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -control 'shared_storages_compute_and_login' do - title 'Check the shared storages configuration for compute node' - - only_if { !os_properties.on_docker? && (instance.compute_node? or instance.login_node?) } - - describe 'Check that /opt/intel dir has been mounted' - describe mount("/opt/intel") do - it { should be_mounted } - its('device') { should eq "127.0.0.1:/opt/intel" } - its('type') { should eq 'nfs4' } - its('options') { should include 'hard' } - its('options') { should include '_netdev' } - its('options') { should include 'noatime' } - end -end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb index 29a6fba001..9424e491c0 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_compute.rb @@ -24,8 +24,6 @@ mode '0700' end -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' - # Check to see if is GPU instance with Nvidia installed Chef::Log.warn("GPU instance but no Nvidia drivers found") if graphic_instance? && !nvidia_installed? 
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb index 74113ead54..9c304d3641 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb @@ -17,13 +17,6 @@ include_recipe 'aws-parallelcluster-slurm::config_munge_key' -# Export /opt/slurm -nfs_export "#{node['cluster']['slurm']['install_dir']}" do - network get_vpc_cidr_list - writeable true - options ['no_root_squash'] -end unless on_docker? - # Ensure config directory is in place directory "#{node['cluster']['slurm']['install_dir']}" do user 'root' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb index ce562c2c49..47df2dd4dd 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_login.rb @@ -17,5 +17,3 @@ # TODO: rename, find a better name that include login nodes setup_munge_compute_node - -include_recipe 'aws-parallelcluster-slurm::mount_slurm_dir' diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8bb9cdcde5..a3fdfee52f 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -187,7 +187,7 @@ platforms: image_id: <%= ENV['KITCHEN_UBUNTU2204_AMI'] %> <% else %> image_search: - name: ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106 + name: <% if ENV['KITCHEN_PHASE']=='install' %>ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-<% if ENV['KITCHEN_ARCHITECTURE'] == 'x86_64' %>amd64<% else %>arm64<% end %>-server-20230106<% else %><%= pcluster_prefix %>-ubuntu-2204-lts-hvm-*<% end %> architecture: <%= ENV['KITCHEN_ARCHITECTURE'] %> <% end %> block_device_mappings: