Skip to content

Commit

Permalink
[Develop] Add a new internal storage option, EFS, keeping EBS as the …
Browse files Browse the repository at this point in the history
…default (#2480)

Add backup and restore recipes to move data in second stage images to the shared filesystem
Create a new mount_internal_use_efs.rb resource to mount the internal shared filesystem
Filter the EFS filesystem arrays so that the internal shared filesystems are mounted in init and the customer (cx) filesystems are mounted in config
Refactor environment recipes as needed to adapt to having two separate modes for internal storage
  • Loading branch information
dreambeyondorange authored Oct 6, 2023
1 parent 4a2ab34 commit cf39542
Show file tree
Hide file tree
Showing 29 changed files with 414 additions and 121 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Add support for Rocky Linux 8.
- Install [Spack](https://spack.io) by default in cluster user's home directory.
- Add support for `Scheduling/SlurmSettings/Database/DatabaseName` parameter to render `StorageLoc` in the slurmdbd configuration generated by ParallelCluster.
- Add the option to use EFS storage instead of NFS exports from the head node root volume for intra-cluster shared ParallelCluster, Intel, Slurm, and login node data.

**CHANGES**

Expand Down
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@
# Fetch config must be executed after the mount of the shared folders because the config will be saved there
fetch_config 'Fetch and load cluster configs'

# Compute fleet init requires shared dirs
include_recipe "aws-parallelcluster-computefleet::init"
include_recipe "aws-parallelcluster-slurm::init"
4 changes: 2 additions & 2 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
update true
end

# generate the update shared storages mapping file
include_recipe 'aws-parallelcluster-environment::fs_update'
# generate the updated shared storages mapping file
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'

include_recipe 'aws-parallelcluster-environment::directory_service'
include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,10 @@
default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_head'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes']
# Since this is a shared directory, it needs to be defined here first instead of in the dependent cookbook for slurm
default['cluster']['slurm']['install_dir'] = '/opt/slurm'

default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], node['cluster']['slurm']['install_dir'], "/opt/intel"]
default['cluster']['internal_initial_shared_dir'] = "#{node['cluster']['base_dir']}/init_shared"

default['cluster']['head_node_private_ip'] = nil
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,10 @@ suites:
fsx_shared_dirs: ''
raid_shared_dir: ''
ephemeral_dir: test1
- name: fs_update
- name: update_fs_mapping
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::fs_update]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
verifier:
controls:
- fs_data_file_created_correctly
Expand All @@ -440,10 +440,10 @@ suites:
fsx_dns_names: dns1,dns2
fsx_mount_names: mount1,mount2
fsx_volume_junction_paths: value1,value2
- name: fs_update_default_values
- name: update_fs_mapping_default_values
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::fs_update]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
verifier:
controls:
- fs_data_file_with_default_values
Expand Down Expand Up @@ -475,7 +475,7 @@ suites:
- name: mount_shared_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
- recipe[aws-parallelcluster-environment::mount_internal_use_ebs]
verifier:
controls:
- mount_home
Expand All @@ -493,7 +493,7 @@ suites:
- name: mount_shared_login
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
- recipe[aws-parallelcluster-environment::mount_internal_use_ebs]
verifier:
controls:
- mount_home
Expand Down Expand Up @@ -538,10 +538,10 @@ suites:
node_type: LoginNode
raid_shared_dir: raid1
head_node_private_ip: '127.0.0.1'
- name: shared_storages_compute
- name: shared_storages_compute_efs
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::shared_storages]
- recipe[aws-parallelcluster-environment::efs]
verifier:
controls:
- shared_storages_compute_and_login
Expand All @@ -552,10 +552,38 @@ suites:
cluster:
node_type: ComputeFleet
head_node_private_ip: '127.0.0.1'
- name: shared_storages_login
- name: shared_storages_login_efs
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::shared_storages]
- recipe[aws-parallelcluster-environment::efs]
verifier:
controls:
- shared_storages_compute_and_login
attributes:
dependencies:
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_compute_shared_storages
cluster:
node_type: LoginNode
head_node_private_ip: '127.0.0.1'
- name: shared_storages_compute_ebs
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_intel_dir]
verifier:
controls:
- shared_storages_compute_and_login
attributes:
dependencies:
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_compute_shared_storages
cluster:
node_type: ComputeFleet
head_node_private_ip: '127.0.0.1'
- name: shared_storages_login_ebs
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_intel_dir]
verifier:
controls:
- shared_storages_compute_and_login
Expand Down
29 changes: 25 additions & 4 deletions cookbooks/aws-parallelcluster-environment/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,30 @@
action :configure
end
include_recipe 'aws-parallelcluster-environment::ephemeral_drives'
# fs_update generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::fs_update'
include_recipe 'aws-parallelcluster-environment::shared_storages'
# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
# Export home dir from the head node
include_recipe 'aws-parallelcluster-environment::export_home'

if node['cluster']['internal_shared_storage_type'] == 'ebs'
# Export internal use dirs from the head node
include_recipe 'aws-parallelcluster-environment::export_internal_use_ebs'
# Mount intel on compute and login nodes
include_recipe 'aws-parallelcluster-environment::mount_intel_dir'
end

include_recipe 'aws-parallelcluster-environment::ebs'
include_recipe 'aws-parallelcluster-environment::raid'
include_recipe "aws-parallelcluster-environment::fs_mount"
include_recipe "aws-parallelcluster-environment::efs"

# Mount FSx directory with manage_fsx resource
lustre "mount fsx" do
fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',')
fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',')
fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',')
fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',')
fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',')
fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',')
action :mount
not_if { node['cluster']['fsx_fs_ids'].split(',').empty? }
end
41 changes: 41 additions & 0 deletions cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
return if on_docker?

# Each attribute is a comma-separated list; entries at the same position
# across the four lists describe one EFS filesystem.
all_shared_dirs = node['cluster']['efs_shared_dirs'].split(',')
all_fs_ids = node['cluster']['efs_fs_ids'].split(',')
all_encryption_flags = node['cluster']['efs_encryption_in_transits'].split(',')
all_iam_flags = node['cluster']['efs_iam_authorizations'].split(',')

# Positions of the customer-use filesystems, i.e. every entry whose shared dir
# is not one of the directories listed in node['cluster']['internal_shared_dirs'].
customer_positions = all_shared_dirs.each_index.reject do |position|
  node['cluster']['internal_shared_dirs'].include?(all_shared_dirs[position])
end

# Project each list down to the customer-use positions only, so the four
# arrays handed to the efs resource stay aligned with each other.
customer_shared_dirs = customer_positions.map { |position| all_shared_dirs[position] }
customer_fs_ids = customer_positions.map { |position| all_fs_ids[position] }
customer_encryption_flags = customer_positions.map { |position| all_encryption_flags[position] }
customer_iam_flags = customer_positions.map { |position| all_iam_flags[position] }

# Mount the customer-use EFS directories with the efs resource;
# nothing is mounted when no customer-use filesystem remains after filtering.
efs "mount efs" do
  shared_dir_array customer_shared_dirs
  efs_fs_id_array customer_fs_ids
  efs_encryption_in_transit_array customer_encryption_flags
  efs_iam_authorization_array customer_iam_flags
  action :mount
  not_if { customer_shared_dirs.empty? }
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Nothing is exported when running inside a Docker container.
return if on_docker?

current_node_type = node['cluster']['node_type']

if current_node_type == 'HeadNode'
  # Only the head node exports /home through the volume resource.
  volume "export /home" do
    shared_dir "/home"
    action :export
  end
elsif %w(ComputeFleet LoginNode).include?(current_node_type)
  # Compute and login nodes have nothing to export; just record that in the log.
  Chef::Log.info("Export only from the HeadNode")
else
  # Any other node type is treated as a configuration error.
  raise "node_type must be HeadNode, ComputeFleet, or LoginNode"
end
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@

case node['cluster']['node_type']
when 'HeadNode'
volume "export /home" do
shared_dir "/home"
action :export
end

# Export /opt/parallelcluster/shared
volume "export #{node['cluster']['shared_dir']}" do
shared_dir node['cluster']['shared_dir']
Expand All @@ -41,19 +36,7 @@
end

when 'ComputeFleet', 'LoginNode'
# Mount /opt/intel over NFS only if it exists
exported_intel_dir = format_directory('/opt/intel')
volume "mount /opt/intel" do
action :mount
shared_dir '/opt/intel'
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
only_if { ::File.directory?("/opt/intel") }
end

Chef::Log.info("Export only from the HeadNode")
else
raise "node_type must be HeadNode or ComputeFleet"
end

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Mounts /opt/intel over NFS from the head node on compute and login nodes.
# The head node itself mounts nothing; an unknown node type aborts the run.

# Nothing is mounted when running inside a Docker container.
return if on_docker?

case node['cluster']['node_type']
when 'HeadNode'
  Chef::Log.info("Mount only on the ComputeFleet and LoginNodes")
when 'ComputeFleet', 'LoginNode'
  # Mount /opt/intel over NFS only if it exists
  # NOTE(review): format_directory is defined outside this recipe; presumably it
  # normalizes the path into the form exported by the head node - confirm.
  exported_intel_dir = format_directory('/opt/intel')
  volume "mount /opt/intel" do
    action :mount
    shared_dir '/opt/intel'
    # lazy: the head node IP attribute is read when the device value is needed,
    # not when the recipe is compiled.
    device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" })
    fstype 'nfs'
    options node['cluster']['nfs']['hard_mount_options']
    # Retry up to 10 times, 6 seconds apart, before failing the resource.
    retries 10
    retry_delay 6
    # Skip the mount entirely when the local /opt/intel directory is absent.
    only_if { ::File.directory?("/opt/intel") }
  end
else
  # LoginNode is a handled node type above, so it is listed as valid here too.
  raise "node_type must be HeadNode, ComputeFleet, or LoginNode"
end
15 changes: 14 additions & 1 deletion cookbooks/aws-parallelcluster-environment/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,25 @@
#

include_recipe "aws-parallelcluster-environment::cfnconfig_mixed"
include_recipe "aws-parallelcluster-environment::mount_shared"
cloudwatch "Configure CloudWatch" do
action :configure
end

case node['cluster']['internal_shared_storage_type']
when 'efs'
include_recipe "aws-parallelcluster-environment::mount_internal_use_efs"
when 'ebs'
include_recipe "aws-parallelcluster-environment::mount_internal_use_ebs"
else
raise "internal_shared_storage_type must be ebs or efs"
end

include_recipe "aws-parallelcluster-environment::mount_home" if %w(ComputeFleet LoginNode).include? node['cluster']['node_type']

include_recipe "aws-parallelcluster-environment::network_interfaces"
include_recipe 'aws-parallelcluster-environment::imds'

# login nodes keys and directory service require shared storage
include_recipe "aws-parallelcluster-environment::login_nodes_keys"
include_recipe "aws-parallelcluster-environment::directory_service"

Expand Down
Loading

0 comments on commit cf39542

Please sign in to comment.