From 332047f0bb64b6521eb8b4783040dbdb19f7e34e Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Mon, 11 Sep 2023 14:57:09 -0400
Subject: [PATCH 1/2] Munge key rotation

---
 .../recipes/config/config_head_node.rb        | 14 ++++
 .../recipes/update/update_head_node.rb        | 16 +++++
 .../slurm/head_node/update_munge_key.sh.erb   | 64 +++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb

diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
index 8d4ce68f0..799c28b19 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
@@ -281,3 +281,17 @@
   retries 5
   retry_delay 2
 end unless redhat_on_docker?
+
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) },
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    cluster_user: node['cluster']['cluster_user']
+  )
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 9f5ba1e7c..c39f756e3 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -201,6 +201,22 @@ def update_nodes_in_queue(strategy, queues)
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
 end unless on_docker?
 
+# Update rotation script to update secret arn
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) },
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    cluster_user: node['cluster']['cluster_user']
+  )
+  only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
+end
+
 # The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
 # slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
 # is enabled we must pull the database password from Secrets Manager once again.
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
new file mode 100644
index 000000000..198032e45
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
@@ -0,0 +1,64 @@
+#!/bin/bash
+# This script updates the munge key used in the system.
+# It fetches the base64-encoded key from AWS Secrets Manager and exits with an error if no secret ARN is set.
+# The script does not require any argument.
+#
+# Usage: ./update_munge_key.sh
+# #
+
+set -e
+
+MUNGE_KEY_FILE="/etc/munge/munge.key"
+SECRET_ARN="<%= @munge_key_secret_arn %>"
+REGION="<%= @region %>"
+MUNGE_USER="<%= @munge_user %>"
+MUNGE_GROUP="<%= @munge_group %>"
+CLUSTER_USER="<%= @cluster_user %>"
+
+# If SECRET_ARN is provided, fetch the munge key from Secrets Manager
+if [ -n "${SECRET_ARN}" ]; then
+  echo "Fetching munge key from AWS Secrets Manager: ${SECRET_ARN}"
+  encoded_key=$(aws secretsmanager get-secret-value --secret-id "${SECRET_ARN}" --query 'SecretString' --output text --region "${REGION}")
+
+  if [ -z "${encoded_key}" ]; then
+    echo "Error fetching munge key from Secrets Manager or the key is empty"
+    exit 1
+  fi
+
+  # Decode munge key and write to munge.key file
+  decoded_key=$(echo $encoded_key | base64 -d)
+  if [ $? -ne 0 ]; then
+    echo "Error decoding the munge key with base64"
+    exit 1
+  fi
+
+  # Remove current munge key if exists
+  if [ -f "${MUNGE_KEY_FILE}" ]; then
+    rm -f ${MUNGE_KEY_FILE}
+  fi
+
+  echo "${decoded_key}" > ${MUNGE_KEY_FILE}
+
+  # Set ownership on the key
+  chown ${MUNGE_USER}:${MUNGE_GROUP} ${MUNGE_KEY_FILE}
+  # Enforce correct permission on the key
+  chmod 0600 ${MUNGE_KEY_FILE}
+
+else
+  echo "MUNGE KEY SECRET ARN isn't provided"
+  exit 1
+fi
+
+# Enable and restart munge service
+systemctl enable munge
+echo "Start to Restart munge service"
+systemctl restart munge || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; }
+echo "Restart munge service completed"
+
+# Share munge key
+echo "Start to Share munge key"
+mkdir -p /home/${CLUSTER_USER}/.munge
+cp /etc/munge/munge.key /home/${CLUSTER_USER}/.munge/.munge.key
+echo "Share munge key completed"
+
+exit 0

From b9c81562719c3472989b34d3d44b5dc8966bd9f5 Mon Sep 17 00:00:00 2001
From: Xuanqi He
Date: Thu, 14 Sep 2023 15:41:58 -0400
Subject: [PATCH 2/2] Refined the rotation script

---
Note: the first hunk also stops passing the decoded munge key through a shell
variable. The key is binary, and bash command substitution drops NUL bytes and
trailing newlines (and `echo` appends one), so the installed key could differ
from the secret. The key is now decoded straight into a mktemp file (created
with mode 0600), gets its owner and mode set, and is then moved into place.
The post-image blob id on the index line below was not regenerated.

 .../slurm/head_node/update_munge_key.sh.erb   | 46 +++++++++++++------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
index 198032e45..3fc6533f6 100644
--- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
@@ -25,24 +25,21 @@ if [ -n "${SECRET_ARN}" ]; then
     exit 1
   fi
 
-  # Decode munge key and write to munge.key file
-  decoded_key=$(echo $encoded_key | base64 -d)
-  if [ $? -ne 0 ]; then
+  # The munge key is binary: decode it straight into a private temporary file instead of a shell
+  # variable, because command substitution drops NUL bytes and trailing newlines.
+  tmp_key_file=$(mktemp "${MUNGE_KEY_FILE}.XXXXXX")
+  trap 'rm -f "${tmp_key_file}"' EXIT
+  if ! echo "${encoded_key}" | base64 -d > "${tmp_key_file}"; then
     echo "Error decoding the munge key with base64"
     exit 1
   fi
 
-  # Remove current munge key if exists
-  if [ -f "${MUNGE_KEY_FILE}" ]; then
-    rm -f ${MUNGE_KEY_FILE}
-  fi
-
-  echo "${decoded_key}" > ${MUNGE_KEY_FILE}
-
   # Set ownership on the key
-  chown ${MUNGE_USER}:${MUNGE_GROUP} ${MUNGE_KEY_FILE}
+  chown "${MUNGE_USER}:${MUNGE_GROUP}" "${tmp_key_file}"
   # Enforce correct permission on the key
-  chmod 0600 ${MUNGE_KEY_FILE}
+  chmod 0600 "${tmp_key_file}"
+  # Replace the current key only after the new one has been fully written and secured
+  mv -f "${tmp_key_file}" "${MUNGE_KEY_FILE}"
 
 else
   echo "MUNGE KEY SECRET ARN isn't provided"
@@ -51,14 +48,24 @@ fi
 
 # Enable and restart munge service
 systemctl enable munge
-echo "Start to Restart munge service"
-systemctl restart munge || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; } || { sleep 10; systemctl restart munge; }
-echo "Restart munge service completed"
+echo "Restarting munge service"
+systemctl restart munge
+
+# Wait for a short period
+sleep 5
+
+# Check if munge service is running
+if systemctl --quiet is-active munge; then
+  echo "Munge service is active"
+else
+  echo "Failed to restart munge service"
+  exit 1
+fi
 
 # Share munge key
-echo "Start to Share munge key"
+echo "Sharing munge key"
 mkdir -p /home/${CLUSTER_USER}/.munge
 cp /etc/munge/munge.key /home/${CLUSTER_USER}/.munge/.munge.key
-echo "Share munge key completed"
+echo "Shared munge key"
 
 exit 0