Skip to content

Commit

Permalink
[Develop] Migrate internal storage to EFS from NFS exports
Browse files Browse the repository at this point in the history
Add backup and restore recipes to move data in second stage images to shared filesystems
Create a new mount_internal_use_fs.rb recipe to mount the internal shared filesystems
Filter the EFS filesystem arrays so that internal shared filesystems are mounted in init and customer (cx) filesystems are mounted in config
Refactor environment recipes to be clearer in functional description and remove unnecessary recipes
  • Loading branch information
dreambeyondorange committed Sep 21, 2023
1 parent 2d1aecf commit 29a478c
Show file tree
Hide file tree
Showing 24 changed files with 271 additions and 254 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Install [Spack](https://spack.io) by default in cluster user's home directory.

**CHANGES**
- Remove the NFS exports from the head node root volume and use Amazon EFS storage instead for intra-cluster shared ParallelCluster, Intel, Slurm, and Login Node data.

**BUG FIXES**
- Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources.
Expand Down
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@
# Fetch config must be executed after the mount of the shared folders because the config will be saved there
fetch_config 'Fetch and load cluster configs'

# Compute fleet init requires shared dirs
include_recipe "aws-parallelcluster-computefleet::init"
include_recipe "aws-parallelcluster-slurm::init"
4 changes: 2 additions & 2 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
update true
end

# generate the update shared storages mapping file
include_recipe 'aws-parallelcluster-environment::fs_update'
# generate the updated shared storages mapping file
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'

include_recipe 'aws-parallelcluster-environment::directory_service'
include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,6 @@
default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_head'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes']
default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], "/opt/slurm", "/opt/intel"]

default['cluster']['head_node_private_ip'] = nil
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,10 @@ suites:
fsx_shared_dirs: ''
raid_shared_dir: ''
ephemeral_dir: test1
- name: fs_update
- name: update_fs_mapping
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::fs_update]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
verifier:
controls:
- fs_data_file_created_correctly
Expand All @@ -440,10 +440,10 @@ suites:
fsx_dns_names: dns1,dns2
fsx_mount_names: mount1,mount2
fsx_volume_junction_paths: value1,value2
- name: fs_update_default_values
- name: update_fs_mapping_default_values
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::fs_update]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
verifier:
controls:
- fs_data_file_with_default_values
Expand Down Expand Up @@ -472,42 +472,43 @@ suites:
scheduler: slurm
head_node_imds_secured: 'true'
head_node_imds_allowed_users: ['root', 'nobody']
- name: mount_shared_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
verifier:
controls:
- mount_home
- mount_shared_compute
attributes:
dependencies:
- recipe:aws-parallelcluster-platform::directories
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_export_directories
cluster:
node_type: 'ComputeFleet'
head_node_private_ip: '127.0.0.1'
head_node_home_path: '/fake_headnode_home'
shared_dir_head: '/fake_headnode_shared'
- name: mount_shared_login
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
verifier:
controls:
- mount_home
- mount_shared_login
attributes:
dependencies:
- recipe:aws-parallelcluster-platform::directories
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_export_directories
cluster:
node_type: 'LoginNode'
head_node_private_ip: '127.0.0.1'
head_node_home_path: '/fake_headnode_home'
shared_dir_head: '/fake_headnode_shared'
# TODO: replace the suites below with tests for the mount_internal_use_fs recipe (it depends on shared storage)
# - name: mount_shared_compute
# run_list:
# - recipe[aws-parallelcluster-tests::setup]
# - recipe[aws-parallelcluster-environment::mount_shared]
# verifier:
# controls:
# - mount_home
# - mount_shared_compute
# attributes:
# dependencies:
# - recipe:aws-parallelcluster-platform::directories
# - resource:nfs
# - recipe:aws-parallelcluster-environment::mock_export_directories
# cluster:
# node_type: 'ComputeFleet'
# head_node_private_ip: '127.0.0.1'
# head_node_home_path: '/fake_headnode_home'
# shared_dir_head: '/fake_headnode_shared'
# - name: mount_shared_login
# run_list:
# - recipe[aws-parallelcluster-tests::setup]
# - recipe[aws-parallelcluster-environment::mount_shared]
# verifier:
# controls:
# - mount_home
# - mount_shared_login
# attributes:
# dependencies:
# - recipe:aws-parallelcluster-platform::directories
# - resource:nfs
# - recipe:aws-parallelcluster-environment::mock_export_directories
# cluster:
# node_type: 'LoginNode'
# head_node_private_ip: '127.0.0.1'
# head_node_home_path: '/fake_headnode_home'
# shared_dir_head: '/fake_headnode_shared'
- name: raid_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
Expand Down Expand Up @@ -541,7 +542,7 @@ suites:
- name: shared_storages_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::shared_storages]
- recipe[aws-parallelcluster-environment::efs]
verifier:
controls:
- shared_storages_compute_and_login
Expand All @@ -555,7 +556,7 @@ suites:
- name: shared_storages_login
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::shared_storages]
- recipe[aws-parallelcluster-environment::efs]
verifier:
controls:
- shared_storages_compute_and_login
Expand Down
20 changes: 16 additions & 4 deletions cookbooks/aws-parallelcluster-environment/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,21 @@
action :configure
end
include_recipe 'aws-parallelcluster-environment::ephemeral_drives'
# fs_update generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::fs_update'
include_recipe 'aws-parallelcluster-environment::shared_storages'
# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
include_recipe 'aws-parallelcluster-environment::export_home'
include_recipe 'aws-parallelcluster-environment::ebs'
include_recipe 'aws-parallelcluster-environment::raid'
include_recipe "aws-parallelcluster-environment::fs_mount"
include_recipe "aws-parallelcluster-environment::efs"

# Mount the FSx filesystems through the lustre resource.
# Every fsx_* cluster attribute read here is a comma-separated string; this helper
# turns one of them into the array form passed to the resource.
fsx_attribute_list = ->(attribute_name) { node['cluster'][attribute_name].split(',') }

lustre "mount fsx" do
  fsx_fs_id_array fsx_attribute_list.call('fsx_fs_ids')
  fsx_fs_type_array fsx_attribute_list.call('fsx_fs_types')
  fsx_shared_dir_array fsx_attribute_list.call('fsx_shared_dirs')
  fsx_dns_name_array fsx_attribute_list.call('fsx_dns_names')
  fsx_mount_name_array fsx_attribute_list.call('fsx_mount_names')
  fsx_volume_junction_path_array fsx_attribute_list.call('fsx_volume_junction_paths')
  action :mount
  # Skip the resource when the list of FSx filesystem ids is empty
  not_if { fsx_attribute_list.call('fsx_fs_ids').empty? }
end
41 changes: 41 additions & 0 deletions cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
# Nothing to do when on_docker? reports true.
return if on_docker?

# The efs_* cluster attributes are comma-separated strings that are read positionally:
# entry N of each list is used together with entry N of the others.
all_dirs = node['cluster']['efs_shared_dirs'].split(',')
all_fs_ids = node['cluster']['efs_fs_ids'].split(',')
all_encryption_flags = node['cluster']['efs_encryption_in_transits'].split(',')
all_iam_flags = node['cluster']['efs_iam_authorizations'].split(',')

# Keep only the positions whose shared dir is NOT one of the internal shared dirs;
# what remains are the customer (cx) filesystems handled by this recipe.
customer_positions = all_dirs.each_index.reject do |position|
  node['cluster']['internal_shared_dirs'].include?(all_dirs[position])
end

# Pick the matching entry from every list (a missing entry yields nil, as with plain indexing)
customer_dirs = all_dirs.values_at(*customer_positions)
customer_fs_ids = all_fs_ids.values_at(*customer_positions)
customer_encryption_flags = all_encryption_flags.values_at(*customer_positions)
customer_iam_flags = all_iam_flags.values_at(*customer_positions)

# Mount the customer EFS directories with the efs resource
efs "mount efs" do
  shared_dir_array customer_dirs
  efs_fs_id_array customer_fs_ids
  efs_encryption_in_transit_array customer_encryption_flags
  efs_iam_authorization_array customer_iam_flags
  action :mount
  # No customer filesystems left after filtering: skip the resource
  not_if { customer_dirs.empty? }
end
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# frozen_string_literal: true

# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
Expand All @@ -11,13 +12,16 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Mount /opt/slurm over NFS
# Computemgtd config is under /opt/slurm/etc/pcluster; all compute nodes share a config
mount "#{node['cluster']['slurm']['install_dir']}" do
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['slurm']['install_dir']}" })
fstype "nfs"
options node['cluster']['nfs']['hard_mount_options']
action %i(mount enable)
retries 10
retry_delay 6
return if on_docker?

# /home is exported from the head node only; compute fleet and login nodes just log
# that fact, and any other node_type is rejected.
node_type = node['cluster']['node_type']

if node_type == 'HeadNode'
  volume "export /home" do
    shared_dir "/home"
    action :export
  end
elsif %w(ComputeFleet LoginNode).include?(node_type)
  Chef::Log.info("Export only from the HeadNode")
else
  raise "node_type must be HeadNode, ComputeFleet, or LoginNode"
end

This file was deleted.

This file was deleted.

Loading

0 comments on commit 29a478c

Please sign in to comment.