Skip to content

Commit

Permalink
[Develop] Add support for an external shared /home
Browse files Browse the repository at this point in the history
Users can now specify `/home` as a mount point in SharedStorage. This was previously a reserved directory.
When a user specifies this, the existing data in `/home` is copied to the external filesystem without overwriting files that are already there.
This means that in order to share the `/home` directory across clusters, users must specify the same security
credentials when creating the clusters.
  • Loading branch information
dreambeyondorange committed Oct 6, 2023
1 parent d754cf9 commit dce308b
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 24 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Install [Spack](https://spack.io) by default in cluster user's home directory.
- Add support for `Scheduling/SlurmSettings/Database/DatabaseName` parameter to render `StorageLoc` in the slurmdbd configuration generated by ParallelCluster.
- Add the option to use EFS storage instead of NFS exports from the head node root volume for intra-cluster shared ParallelCluster, Intel, Slurm, and login node data.
- Add the option to use EFS or FSx as external shared storage for `/home` via the `SharedStorage` section of the config file.

**CHANGES**

Expand Down
19 changes: 4 additions & 15 deletions cookbooks/aws-parallelcluster-environment/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
action :configure
end
include_recipe 'aws-parallelcluster-environment::ephemeral_drives'
# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes
# update_fs_mapping generates the shared storage mapping file, so it must be executed before shared storage recipes
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
# Export home dir from the head node
# Export the home dir from the head node when using ebs
include_recipe 'aws-parallelcluster-environment::export_home'

if node['cluster']['internal_shared_storage_type'] == 'ebs'
Expand All @@ -32,16 +32,5 @@

include_recipe 'aws-parallelcluster-environment::ebs'
include_recipe 'aws-parallelcluster-environment::raid'
include_recipe "aws-parallelcluster-environment::efs"

# Mount FSx directory with manage_fsx resource
lustre "mount fsx" do
fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',')
fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',')
fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',')
fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',')
fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',')
fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',')
action :mount
not_if { node['cluster']['fsx_fs_ids'].split(',').empty? }
end
include_recipe 'aws-parallelcluster-environment::efs'
include_recipe 'aws-parallelcluster-environment::fsx'
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
# Identify the customer use filesystems and store their data in arrays for the EFS resource
efs_shared_dir_array.each_with_index do |dir, index|
next if node['cluster']['internal_shared_dirs'].include?(dir)
next if dir == "/home"
cx_shared_dir_array.push(dir)
cx_efs_fs_id_array.push(efs_fs_id_array[index])
cx_efs_encryption_array.push(efs_encryption_in_transit_array[index])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@

return if on_docker?

return unless node['cluster']['internal_shared_storage_type'] == 'ebs'

# Nothing to export when /home is listed among the EFS or FSx shared
# directories, so stop processing this recipe in that case.
efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',')
fsx_shared_dir_array = node['cluster']['fsx_shared_dirs'].split(',')
is_home_shared = (efs_shared_dir_array + fsx_shared_dir_array).include?("/home")
return if is_home_shared

case node['cluster']['node_type']
when 'HeadNode'
volume "export /home" do
Expand Down
49 changes: 49 additions & 0 deletions cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
return if on_docker?

# Comma-separated FSx attributes from the cluster configuration, one entry per filesystem
cluster_attributes = node['cluster']
all_fs_ids = cluster_attributes['fsx_fs_ids'].split(',')
all_fs_types = cluster_attributes['fsx_fs_types'].split(',')
all_shared_dirs = cluster_attributes['fsx_shared_dirs'].split(',')
all_dns_names = cluster_attributes['fsx_dns_names'].split(',')
all_mount_names = cluster_attributes['fsx_mount_names'].split(',')
all_volume_junction_paths = cluster_attributes['fsx_volume_junction_paths'].split(',')

# Positions of the customer use filesystems: every entry except one shared as /home
cx_positions = all_shared_dirs.each_index.reject { |position| all_shared_dirs[position] == "/home" }

# Pick the same positions out of every attribute list so the arrays stay aligned;
# a position missing from a shorter list yields nil, exactly like plain indexing
cx_fs_id_array = all_fs_ids.values_at(*cx_positions)
cx_fs_type_array = all_fs_types.values_at(*cx_positions)
cx_shared_dir_array = all_shared_dirs.values_at(*cx_positions)
cx_dns_name_array = all_dns_names.values_at(*cx_positions)
cx_mount_name_array = all_mount_names.values_at(*cx_positions)
cx_volume_junction_path_array = all_volume_junction_paths.values_at(*cx_positions)

# Hand the filtered lists to the lustre resource; skipped when nothing is left to mount
lustre "mount fsx" do
  fsx_fs_id_array cx_fs_id_array
  fsx_fs_type_array cx_fs_type_array
  fsx_shared_dir_array cx_shared_dir_array
  fsx_dns_name_array cx_dns_name_array
  fsx_mount_name_array cx_mount_name_array
  fsx_volume_junction_path_array cx_volume_junction_path_array
  action :mount
  not_if { cx_fs_id_array.empty? }
end
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,72 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

volume "mount /home" do
action :mount
shared_dir '/home'
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
return if on_docker?

efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',')
# Named differently from the lustre resource property fsx_shared_dir_array used below,
# so that `fsx_shared_dir_array [...]` inside the resource block is not parsed as
# indexing a local variable
fsx_input_shared_dir_array = node['cluster']['fsx_shared_dirs'].split(',')

# Position of /home in each list of shared directories (nil when it is not listed)
efs_home_index = efs_shared_dir_array.index("/home")
fsx_home_index = fsx_input_shared_dir_array.index("/home")

if efs_home_index || fsx_home_index
  # /home is a shared filesystem: back up the local data, mount the filesystem, then restore
  include_recipe "aws-parallelcluster-environment::backup_home_shared_data"
  include_recipe "aws-parallelcluster-environment::update_fs_mapping"

  if efs_home_index
    # EFS is checked first; FSx is only considered when /home is not an EFS shared dir
    efs "mount shared efs home" do
      shared_dir_array [efs_shared_dir_array[efs_home_index]]
      efs_fs_id_array [node['cluster']['efs_fs_ids'].split(',')[efs_home_index]]
      efs_encryption_in_transit_array [node['cluster']['efs_encryption_in_transits'].split(',')[efs_home_index]]
      efs_iam_authorization_array [node['cluster']['efs_iam_authorizations'].split(',')[efs_home_index]]
      action :mount
    end
  else
    lustre "mount shared fsx home" do
      fsx_fs_id_array [node['cluster']['fsx_fs_ids'].split(',')[fsx_home_index]]
      fsx_fs_type_array [node['cluster']['fsx_fs_types'].split(',')[fsx_home_index]]
      fsx_shared_dir_array [fsx_input_shared_dir_array[fsx_home_index]]
      fsx_dns_name_array [node['cluster']['fsx_dns_names'].split(',')[fsx_home_index]]
      fsx_mount_name_array [node['cluster']['fsx_mount_names'].split(',')[fsx_home_index]]
      fsx_volume_junction_path_array [node['cluster']['fsx_volume_junction_paths'].split(',')[fsx_home_index]]
      action :mount
    end
  end

  include_recipe "aws-parallelcluster-environment::restore_home_shared_data"
else
  # Otherwise mount the NFS export to compute and login nodes
  case node['cluster']['node_type']
  when 'ComputeFleet', 'LoginNode'
    volume "mount /home" do
      action :mount
      shared_dir '/home'
      # Resolved lazily, i.e. when the resource runs rather than when the recipe is compiled
      device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" })
      fstype 'nfs'
      options node['cluster']['nfs']['hard_mount_options']
      retries 10
      retry_delay 6
    end
  when 'HeadNode'
    Chef::Log.info("Do not mount NFS shares on the HeadNode")
  else
    raise "node_type must be ComputeFleet, LoginNode or HeadNode"
  end
end

3 changes: 2 additions & 1 deletion cookbooks/aws-parallelcluster-environment/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
raise "internal_shared_storage_type must be ebs or efs"
end

include_recipe "aws-parallelcluster-environment::mount_home" if %w(ComputeFleet LoginNode).include? node['cluster']['node_type']
# Mount the home directory to all nodes if it is shared, otherwise mount the NFS share to compute and login nodes
include_recipe "aws-parallelcluster-environment::mount_home"

include_recipe "aws-parallelcluster-environment::network_interfaces"
include_recipe 'aws-parallelcluster-environment::imds'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

# The backup is only taken on the head node
return unless node['cluster']['node_type'] == 'HeadNode'

# Copy the current content of /home to a temporary location.
# This preserves data generated during the build of ParallelCluster AMIs,
# which must survive the conversion of /home to shared storage.
bash "Backup /home" do
  user 'root'
  group 'root'
  code <<~EOH
    mkdir -p /tmp/home
    rsync -a /home/ /tmp/home
  EOH
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

if node['cluster']['node_type'] == 'HeadNode'
  # Restore the shared storage home data if it doesn't already exist
  # This is necessary to preserve any data in these directories that was
  # generated during the build of ParallelCluster AMIs after converting to
  # shared storage and backed up to a temporary location previously
  # Remove the backup only after the copy has succeeded
  bash "Restore /home" do
    user 'root'
    group 'root'
    # `set -e` aborts the script as soon as rsync fails, so the backup in
    # /tmp/home is never deleted after an incomplete copy and the resource
    # reports the failure instead of exiting 0 from the trailing rm
    code <<~EOH
      set -e
      rsync -a --ignore-existing /tmp/home/ /home
      rm -rf /tmp/home/
    EOH
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

fetch_config 'Fetch and load cluster configs'

template "#{node['cluster']['scripts_dir']}/slurm/slurm_resume" do
source 'slurm/resume_program.erb'
owner node['cluster']['slurm']['user']
Expand Down

0 comments on commit dce308b

Please sign in to comment.