Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "[Develop] Migrate internal storage to EFS from NFS exports (#… #2469

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Install [Spack](https://spack.io) by default in cluster user's home directory.

**CHANGES**
- Remove the NFS exports from the head node root volume and use Amazon EFS storage instead for intra-cluster shared ParallelCluster, Intel, Slurm, and Login Node data.

**BUG FIXES**
- Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,5 @@
# Fetch config must be executed after the mount of the shared folders because the config will be saved there
fetch_config 'Fetch and load cluster configs'

# Compute fleet init requires shared dirs
include_recipe "aws-parallelcluster-computefleet::init"
include_recipe "aws-parallelcluster-slurm::init"
4 changes: 2 additions & 2 deletions cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
update true
end

# generate the updated shared storages mapping file
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
# generate the update shared storages mapping file
include_recipe 'aws-parallelcluster-environment::fs_update'

include_recipe 'aws-parallelcluster-environment::directory_service'
include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,5 @@
default['cluster']['shared_dir_compute'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_head'] = node['cluster']['shared_dir']
default['cluster']['shared_dir_login'] = node['cluster']['shared_dir_login_nodes']
# Since this is a shared directory, it needs to be defined here first instead of in the dependent cookbook for slurm
default['cluster']['slurm']['install_dir'] = '/opt/slurm'

default['cluster']['internal_shared_dirs'] = [node['cluster']['shared_dir'], node['cluster']['shared_dir_login_nodes'], node['cluster']['slurm']['install_dir'], "/opt/intel"]

default['cluster']['head_node_private_ip'] = nil
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,10 @@ suites:
fsx_shared_dirs: ''
raid_shared_dir: ''
ephemeral_dir: test1
- name: update_fs_mapping
- name: fs_update
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
- recipe[aws-parallelcluster-environment::fs_update]
verifier:
controls:
- fs_data_file_created_correctly
Expand All @@ -440,10 +440,10 @@ suites:
fsx_dns_names: dns1,dns2
fsx_mount_names: mount1,mount2
fsx_volume_junction_paths: value1,value2
- name: update_fs_mapping_default_values
- name: fs_update_default_values
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::update_fs_mapping]
- recipe[aws-parallelcluster-environment::fs_update]
verifier:
controls:
- fs_data_file_with_default_values
Expand Down Expand Up @@ -472,43 +472,42 @@ suites:
scheduler: slurm
head_node_imds_secured: 'true'
head_node_imds_allowed_users: ['root', 'nobody']
# TODO replacement for the mount_internal_use_fs recipe since it uses shared storage
# - name: mount_shared_compute
# run_list:
# - recipe[aws-parallelcluster-tests::setup]
# - recipe[aws-parallelcluster-environment::mount_shared]
# verifier:
# controls:
# - mount_home
# - mount_shared_compute
# attributes:
# dependencies:
# - recipe:aws-parallelcluster-platform::directories
# - resource:nfs
# - recipe:aws-parallelcluster-environment::mock_export_directories
# cluster:
# node_type: 'ComputeFleet'
# head_node_private_ip: '127.0.0.1'
# head_node_home_path: '/fake_headnode_home'
# shared_dir_head: '/fake_headnode_shared'
# - name: mount_shared_login
# run_list:
# - recipe[aws-parallelcluster-tests::setup]
# - recipe[aws-parallelcluster-environment::mount_shared]
# verifier:
# controls:
# - mount_home
# - mount_shared_login
# attributes:
# dependencies:
# - recipe:aws-parallelcluster-platform::directories
# - resource:nfs
# - recipe:aws-parallelcluster-environment::mock_export_directories
# cluster:
# node_type: 'LoginNode'
# head_node_private_ip: '127.0.0.1'
# head_node_home_path: '/fake_headnode_home'
# shared_dir_head: '/fake_headnode_shared'
- name: mount_shared_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
verifier:
controls:
- mount_home
- mount_shared_compute
attributes:
dependencies:
- recipe:aws-parallelcluster-platform::directories
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_export_directories
cluster:
node_type: 'ComputeFleet'
head_node_private_ip: '127.0.0.1'
head_node_home_path: '/fake_headnode_home'
shared_dir_head: '/fake_headnode_shared'
- name: mount_shared_login
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::mount_shared]
verifier:
controls:
- mount_home
- mount_shared_login
attributes:
dependencies:
- recipe:aws-parallelcluster-platform::directories
- resource:nfs
- recipe:aws-parallelcluster-environment::mock_export_directories
cluster:
node_type: 'LoginNode'
head_node_private_ip: '127.0.0.1'
head_node_home_path: '/fake_headnode_home'
shared_dir_head: '/fake_headnode_shared'
- name: raid_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
Expand Down Expand Up @@ -542,7 +541,7 @@ suites:
- name: shared_storages_compute
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::efs]
- recipe[aws-parallelcluster-environment::shared_storages]
verifier:
controls:
- shared_storages_compute_and_login
Expand All @@ -556,7 +555,7 @@ suites:
- name: shared_storages_login
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-environment::efs]
- recipe[aws-parallelcluster-environment::shared_storages]
verifier:
controls:
- shared_storages_compute_and_login
Expand Down
20 changes: 4 additions & 16 deletions cookbooks/aws-parallelcluster-environment/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,9 @@
action :configure
end
include_recipe 'aws-parallelcluster-environment::ephemeral_drives'
# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
include_recipe 'aws-parallelcluster-environment::export_home'
# fs_update generates the shared storages mapping file so must be executed before shared storages recipes
include_recipe 'aws-parallelcluster-environment::fs_update'
include_recipe 'aws-parallelcluster-environment::shared_storages'
include_recipe 'aws-parallelcluster-environment::ebs'
include_recipe 'aws-parallelcluster-environment::raid'
include_recipe "aws-parallelcluster-environment::efs"

# Mount FSx directory with manage_fsx resource
lustre "mount fsx" do
fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',')
fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',')
fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',')
fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',')
fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',')
fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',')
action :mount
not_if { node['cluster']['fsx_fs_ids'].split(',').empty? }
end
include_recipe "aws-parallelcluster-environment::fs_mount"
41 changes: 0 additions & 41 deletions cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# frozen_string_literal: true

# Copyright:: 2013-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Returns the entries of a comma-separated node['cluster'] attribute as an array.
# The attribute is read on every call, so guards evaluate it when they run.
split_cluster_attr = ->(attr_name) { node['cluster'][attr_name].split(',') }

# Mount EFS directory with efs resource
efs "mount efs" do
  shared_dir_array split_cluster_attr.call('efs_shared_dirs')
  efs_fs_id_array split_cluster_attr.call('efs_fs_ids')
  efs_encryption_in_transit_array split_cluster_attr.call('efs_encryption_in_transits')
  efs_iam_authorization_array split_cluster_attr.call('efs_iam_authorizations')
  action :mount
  # Nothing to do when no EFS shared directory is configured
  not_if { split_cluster_attr.call('efs_shared_dirs').empty? }
end

# Mount FSx directory with manage_fsx resource
lustre "mount fsx" do
  fsx_fs_id_array split_cluster_attr.call('fsx_fs_ids')
  fsx_fs_type_array split_cluster_attr.call('fsx_fs_types')
  fsx_shared_dir_array split_cluster_attr.call('fsx_shared_dirs')
  fsx_dns_name_array split_cluster_attr.call('fsx_dns_names')
  fsx_mount_name_array split_cluster_attr.call('fsx_mount_names')
  fsx_volume_junction_path_array split_cluster_attr.call('fsx_volume_junction_paths')
  action :mount
  # Nothing to do when no FSx file system id is configured
  not_if { split_cluster_attr.call('fsx_fs_ids').empty? }
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# frozen_string_literal: true

# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

# For each node type that mounts a shared directory over NFS:
#   [attribute holding the local mount point, attribute holding the path exported by the head node]
# ComputeFleet mounts /opt/parallelcluster/shared, LoginNode mounts /opt/parallelcluster/shared_login_nodes.
nfs_mount_attrs = {
  'ComputeFleet' => %w(shared_dir_compute shared_dir_head),
  'LoginNode' => %w(shared_dir_login shared_dir_login),
}.freeze

current_node_type = node['cluster']['node_type']

if current_node_type == 'HeadNode'
  Chef::Log.info("Nothing to mount in the HeadNode")
elsif nfs_mount_attrs.key?(current_node_type)
  local_attr, exported_attr = nfs_mount_attrs[current_node_type]

  include_recipe 'aws-parallelcluster-environment::mount_home'

  # Mount the shared directory of this node type over NFS from the head node
  volume "mount #{node['cluster'][local_attr]}" do
    action :mount
    shared_dir node['cluster'][local_attr]
    # lazy: the "<head node ip>:<exported path>" string is built when the property is evaluated, not when the recipe is compiled
    device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster'][exported_attr]}" })
    fstype 'nfs'
    options node['cluster']['nfs']['hard_mount_options']
    retries 10
    retry_delay 6
  end
else
  raise "node_type must be HeadNode, LoginNode or ComputeFleet"
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

# The head node exports its intra-cluster directories;
# compute and login nodes mount /opt/intel from the head node over NFS.
case node['cluster']['node_type']
when 'HeadNode'
  volume "export /home" do
    shared_dir "/home"
    action :export
  end

  # Export /opt/parallelcluster/shared
  volume "export #{node['cluster']['shared_dir']}" do
    shared_dir node['cluster']['shared_dir']
    action :export
  end

  # Export /opt/parallelcluster/shared_login_nodes
  volume "export #{node['cluster']['shared_dir_login_nodes']}" do
    shared_dir node['cluster']['shared_dir_login_nodes']
    action :export
  end

  # Export /opt/intel only if exists
  volume "export /opt/intel" do
    shared_dir "/opt/intel"
    only_if { ::File.directory?("/opt/intel") }
    action :export
  end

when 'ComputeFleet', 'LoginNode'
  # Mount /opt/intel over NFS only if the directory exists on this node
  # NOTE(review): format_directory is defined outside this recipe; presumably it normalizes the path — confirm
  exported_intel_dir = format_directory('/opt/intel')
  volume "mount /opt/intel" do
    action :mount
    shared_dir '/opt/intel'
    # lazy: the "<head node ip>:<exported path>" string is built when the property is evaluated, not when the recipe is compiled
    device(lazy { "#{node['cluster']['head_node_private_ip']}:#{exported_intel_dir}" })
    fstype 'nfs'
    options node['cluster']['nfs']['hard_mount_options']
    retries 10
    retry_delay 6
    only_if { ::File.directory?("/opt/intel") }
  end

else
  # The message lists every node type accepted by the branches above
  raise "node_type must be HeadNode, LoginNode or ComputeFleet"
end
8 changes: 1 addition & 7 deletions cookbooks/aws-parallelcluster-environment/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,12 @@
#

include_recipe "aws-parallelcluster-environment::cfnconfig_mixed"
include_recipe "aws-parallelcluster-environment::mount_shared"
cloudwatch "Configure CloudWatch" do
action :configure
end
include_recipe "aws-parallelcluster-environment::update_fs_mapping"
include_recipe "aws-parallelcluster-environment::backup_internal_use_shared_data"
include_recipe "aws-parallelcluster-environment::mount_internal_use_fs"
include_recipe "aws-parallelcluster-environment::restore_internal_use_shared_data"

include_recipe "aws-parallelcluster-environment::network_interfaces"
include_recipe 'aws-parallelcluster-environment::imds'

# login nodes keys and directory service require shared storage
include_recipe "aws-parallelcluster-environment::login_nodes_keys"
include_recipe "aws-parallelcluster-environment::directory_service"

Expand Down
Loading
Loading