diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
index ecd6b08b3c..74113ead54 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
@@ -281,3 +281,18 @@
   retries 5
   retry_delay 2
 end unless redhat_on_docker?
+
+# Render the munge key rotation script (owned by root, mode 0700) into the cluster scripts directory
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    cluster_user: node['cluster']['cluster_user'],
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) }
+  )
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 812c9c1038..84bfdf22bc 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -201,6 +201,22 @@ def update_nodes_in_queue(strategy, queues)
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
 end unless on_docker?
 
+# Re-render the munge key rotation script so that it embeds the updated munge key secret ARN
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    cluster_user: node['cluster']['cluster_user'],
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) }
+  )
+  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
+end
+
 update_munge_head_node
 
 # The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
new file mode 100644
index 0000000000..3fc6533f65
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
@@ -0,0 +1,69 @@
+#!/bin/bash
+# This script replaces the munge key used on this node and restarts the munge service.
+# It fetches the key from AWS Secrets Manager and exits with an error if no secret ARN is configured.
+# The script does not require any argument.
+#
+# Usage: ./update_munge_key.sh
+#
+
+set -e
+
+MUNGE_KEY_FILE="/etc/munge/munge.key"
+SECRET_ARN="<%= @munge_key_secret_arn %>"
+REGION="<%= @region %>"
+MUNGE_USER="<%= @munge_user %>"
+MUNGE_GROUP="<%= @munge_group %>"
+CLUSTER_USER="<%= @cluster_user %>"
+
+# The key can only come from Secrets Manager; without an ARN there is nothing to do.
+if [ -z "${SECRET_ARN}" ]; then
+  echo "MUNGE KEY SECRET ARN isn't provided"
+  exit 1
+fi
+
+echo "Fetching munge key from AWS Secrets Manager: ${SECRET_ARN}"
+# Run the call inside `if` so a failure is reported below instead of tripping `set -e` silently.
+if ! encoded_key=$(aws secretsmanager get-secret-value --secret-id "${SECRET_ARN}" \
+    --query 'SecretString' --output text --region "${REGION}"); then
+  encoded_key=""
+fi
+# In text output mode the AWS CLI prints "None" when the secret has no SecretString.
+if [ -z "${encoded_key}" ] || [ "${encoded_key}" = "None" ]; then
+  echo "Error fetching munge key from Secrets Manager or the key is empty"
+  exit 1
+fi
+
+# Decode straight into a private temp file: the key is binary, so routing it through a
+# shell variable and `echo` would drop NUL bytes and alter trailing newlines.
+tmp_key=$(mktemp "${MUNGE_KEY_FILE}.XXXXXX")
+trap 'rm -f "${tmp_key}"' EXIT
+if ! printf '%s' "${encoded_key}" | base64 -d > "${tmp_key}"; then
+  echo "Error decoding the munge key with base64"
+  exit 1
+fi
+
+# Set ownership and permissions before the key becomes visible at its final path
+chown "${MUNGE_USER}:${MUNGE_GROUP}" "${tmp_key}"
+chmod 0600 "${tmp_key}"
+mv -f "${tmp_key}" "${MUNGE_KEY_FILE}"
+
+# Enable and restart munge service
+systemctl enable munge
+echo "Restarting munge service"
+systemctl restart munge
+
+# Wait for a short period before checking the service state
+sleep 5
+
+if ! systemctl --quiet is-active munge; then
+  echo "Failed to restart munge service"
+  exit 1
+fi
+echo "Munge service is active"
+
+# Share munge key
+echo "Sharing munge key"
+mkdir -p "/home/${CLUSTER_USER}/.munge"
+cp "${MUNGE_KEY_FILE}" "/home/${CLUSTER_USER}/.munge/.munge.key"
+echo "Shared munge key"
+exit 0