diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
index b4bf24216..67ee86050 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
@@ -65,15 +65,6 @@ def enable_munge_service
   end
 end
 
-def restart_munge_service
-  service "munge" do
-    supports restart: true
-    action :restart
-    retries 5
-    retry_delay 10
-  end
-end
-
 def setup_munge_head_node
   # Generate munge key or get it's value from secrets manager
   munge_key_manager 'manage_munge_key' do
@@ -81,9 +72,6 @@ def setup_munge_head_node
       node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn)
     }
   end
-
-  enable_munge_service
-  share_munge_head_node
 end
 
 def update_munge_head_node
@@ -92,29 +80,6 @@ def update_munge_head_node
     action :update_munge_key
     only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
   end
-
-  restart_munge_service
-  share_munge_head_node
-end
-
-def share_munge_key_to_dir(shared_dir)
-  bash 'share_munge_key' do
-    user 'root'
-    group 'root'
-    code <<-SHARE_MUNGE_KEY
-      set -e
-      mkdir -p #{shared_dir}/.munge
-      # Copy key to shared dir
-      cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
-      chmod 0700 #{shared_dir}/.munge
-      chmod 0600 #{shared_dir}/.munge/.munge.key
-    SHARE_MUNGE_KEY
-  end
-end
-
-def share_munge_head_node
-  share_munge_key_to_dir(node['cluster']['shared_dir'])
-  share_munge_key_to_dir(node['cluster']['shared_dir_login'])
 end
 
 def setup_munge_key(shared_dir)
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
index 69c3e5316..dd30a2c7b 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
@@ -15,6 +15,26 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Copy pcluster config generator and templates
+include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'
+
+include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
+
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    shared_directory_compute: node['cluster']['shared_dir'],
+    shared_directory_login: node['cluster']['shared_dir_login_nodes']
+  )
+end
+
 include_recipe 'aws-parallelcluster-slurm::config_munge_key'
 
 # Export /opt/slurm
@@ -25,8 +45,6 @@
   only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' }
 end unless on_docker?
 
-include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'
-
 template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
   source 'slurm/slurm.conf.erb'
   owner 'root'
@@ -218,20 +236,3 @@
   retries 5
   retry_delay 2
 end unless redhat_on_docker?
-
-include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
-
-template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
-  source 'slurm/head_node/update_munge_key.sh.erb'
-  owner 'root'
-  group 'root'
-  mode '0700'
-  variables(
-    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
-    region: node['cluster']['region'],
-    munge_user: node['cluster']['munge']['user'],
-    munge_group: node['cluster']['munge']['group'],
-    shared_directory_compute: node['cluster']['shared_dir'],
-    shared_directory_login: node['cluster']['shared_dir_login']
-  )
-end
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 729406dac..a8ffc5aaf 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -225,7 +225,7 @@ def update_nodes_in_queue(strategy, queues)
     munge_user: node['cluster']['munge']['user'],
     munge_group: node['cluster']['munge']['group'],
     shared_directory_compute: node['cluster']['shared_dir'],
-    shared_directory_login: node['cluster']['shared_dir_login']
+    shared_directory_login: node['cluster']['shared_dir_login_nodes']
   )
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
 end
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
index a5f030369..968e323b9 100644
--- a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
+++ b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
@@ -23,35 +23,41 @@
 
 default_action :setup_munge_key
 
-def fetch_and_decode_munge_key(munge_key_secret_arn)
-  declare_resource(:bash, 'fetch_and_decode_munge_key') do
+def restart_munge_service
+  declare_resource(:service, "munge") do
+    supports restart: true
+    action :restart
+    retries 5
+    retry_delay 10
+  end
+end
+
+def share_munge_key_to_dir(shared_dir)
+  declare_resource(:bash, 'share_munge_key') do
     user 'root'
     group 'root'
-    cwd '/tmp'
-    code <<-FETCH_AND_DECODE
+    code <<-SHARE_MUNGE_KEY
       set -e
-      # Get encoded munge key from secrets manager
-      encoded_key=$(aws secretsmanager get-secret-value --secret-id #{munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
-      # If encoded_key doesn't have a value, error and exit
-      if [ -z "$encoded_key" ]; then
-        echo "Error fetching munge key from Secrets Manager or the key is empty"
-        exit 1
-      fi
-
-      # Decode munge key and write to /etc/munge/munge.key
-      decoded_key=$(echo $encoded_key | base64 -d)
-      if [ $? -ne 0 ]; then
-        echo "Error decoding the munge key with base64"
-        exit 1
-      fi
+      mkdir -p #{shared_dir}/.munge
+      # Copy key to shared dir
+      cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
+      chmod 0700 #{shared_dir}/.munge
+      chmod 0600 #{shared_dir}/.munge/.munge.key
+    SHARE_MUNGE_KEY
+  end
+end
-
-      echo "$decoded_key" > /etc/munge/munge.key
+
+def share_munge
+  share_munge_key_to_dir(node['cluster']['shared_dir'])
+  share_munge_key_to_dir(node['cluster']['shared_dir_login_nodes'])
+end
-
-      # Set ownership on the key
-      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
-      # Enforce correct permission on the key
-      chmod 0600 /etc/munge/munge.key
-    FETCH_AND_DECODE
+
+# TODO: Consider renaming 'generate_munge_key' and 'fetch_and_decode_munge_key' to more descriptive names that better convey their functionalities.
+def fetch_and_decode_munge_key
+  declare_resource(:execute, 'fetch_and_decode_munge_key') do
+    user 'root'
+    group 'root'
+    command "/#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh -d"
   end
 end
 
@@ -66,12 +72,21 @@ def generate_munge_key
       chmod 0600 /etc/munge/munge.key
     GENERATE_KEY
   end
+
+  # This function randomly generates a new munge key.
+  # After generating the key, it is essential to restart the munge service so that it starts using the new key.
+  # Moreover, the new key has to be shared across relevant directories to ensure consistent authentication across the cluster.
+  # We're restarting the munge service and sharing the munge key here within the `generate_munge_key` method,
+  # and not within the `fetch_and_decode_munge_key` method because the `update_munge_key.sh` script,
+  # which is called by `fetch_and_decode_munge_key`, already includes these two operations.
+  restart_munge_service
+  share_munge
 end
 
 action :setup_munge_key do
   if new_resource.munge_key_secret_arn
     # This block will fetch the munge key from Secrets Manager
-    fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
+    fetch_and_decode_munge_key
   else
     # This block will randomly generate a munge key
     generate_munge_key
@@ -81,7 +96,7 @@ def generate_munge_key
 action :update_munge_key do
   if new_resource.munge_key_secret_arn
     # This block will fetch the munge key from Secrets Manager and replace the previous munge key
-    fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
+    fetch_and_decode_munge_key
   else
     # This block will randomly generate a munge key and replace the previous munge key
     generate_munge_key
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
index 044819e52..1fa9ffee6 100644
--- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
@@ -15,21 +15,37 @@ MUNGE_USER="<%= @munge_user %>"
 MUNGE_GROUP="<%= @munge_group %>"
 SHARED_DIRECTORY_COMPUTE="<%= @shared_directory_compute %>"
 SHARED_DIRECTORY_LOGIN="<%= @shared_directory_login %>"
-CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"
-
-# Check if the script exists
-if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
-  # Check if login nodes are running
-  if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
-    exit 1
-  fi
-fi
-
-# Check compute fleet status
-compute_fleet_status=$(get-compute-fleet-status.sh)
-if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
-  echo "Compute fleet is not stopped. Please stop it before updating the munge key."
-  exit 1
+DISABLE_RESOURCE_CHECK=false
+
+while getopts "d" opt; do
+  case $opt in
+    d) DISABLE_RESOURCE_CHECK=true;;
+    *)
+      echo "Usage: $0 [-d]" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if ! $DISABLE_RESOURCE_CHECK; then
+  # Check compute fleet status
+  compute_fleet_status=$(get-compute-fleet-status.sh)
+  if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
+    echo "Compute fleet is not stopped. Please stop it before updating the munge key."
+    exit 1
+  fi
+
+  # Check LoginNodes status
+  CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"
+
+  # Check if the script exists
+  if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
+    # Check if login nodes are running
+    if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
+      exit 1
+    fi
+  fi
 fi
 
 # If SECRET_ARN is provided, fetch the munge key from Secrets Manager