Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[develop] Add custom munge key update logic #2452

Merged
merged 9 commits into from
Sep 15, 2023
20 changes: 20 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ def enable_munge_service
end
end

# Declares a Chef service resource that restarts the munge daemon.
# A failed restart is retried up to 5 times with a delay of 10 between attempts.
def restart_munge_service
  service('munge') do
    action :restart
    supports(restart: true)
    retry_delay 10
    retries 5
  end
end

def setup_munge_head_node
# Generate munge key or get its value from secrets manager
munge_key_manager 'manage_munge_key' do
Expand All @@ -77,6 +86,17 @@ def setup_munge_head_node
share_munge_head_node
end

# Refreshes the munge key on the head node during a cluster update.
#
# Declares a munge_key_manager resource with the :update_munge_key action. The secret ARN is read
# lazily from DevSettings/SlurmSettings/MungeKeySecretArn of the cluster config, and the resource
# only runs when the previous cluster config file exists and is_custom_munge_key_updated? is true.
# Afterwards the munge service restart and the key sharing step are declared.
#
# NOTE(review): restart_munge_service and share_munge_head_node are outside the only_if guard, so
# they are declared on every update even when the key did not change - confirm this is intended.
def update_munge_head_node
  munge_key_manager('update_munge_key') do
    action :update_munge_key
    munge_key_secret_arn(lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) })
    only_if do
      previous_config_path = node['cluster']['previous_cluster_config_path']
      ::File.exist?(previous_config_path) && is_custom_munge_key_updated?
    end
  end

  restart_munge_service
  share_munge_head_node
end

def share_munge_head_node
# Share munge key
bash 'share_munge_key' do
Expand Down
16 changes: 14 additions & 2 deletions cookbooks/aws-parallelcluster-slurm/libraries/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,18 @@ def is_compute_node_bootstrap_timeout_updated?(previous_config, config)
evaluate_compute_bootstrap_timeout(previous_config) != evaluate_compute_bootstrap_timeout(config)
end

# Compares previous cluster config with the current one for changes in a parameter.
#
# Parameters:
# - `param`: An array representing the sequence of nested keys to the parameter to be checked
#
# Returns true when the value found under `param` differs between the two config files and false
# when it is the same (including when it is absent from both).
#
# A config file that parses to nothing (for example an empty file) is treated as an empty mapping,
# so the parameter simply counts as absent on that side instead of raising NoMethodError.
#
# TODO(review suggestion): other checks in this file, e.g. are_bulk_custom_slurm_settings_updated,
# could reuse this helper.
def config_parameter_changed?(param)
  require 'yaml'
  config = YAML.safe_load(File.read(node['cluster']['cluster_config_path'])) || {}
  previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path'])) || {}
  config.dig(*param) != previous_config.dig(*param)
end

# Reports whether the Scheduling/SlurmSettings/Database parameter differs between the previous and
# the current cluster config, by delegating to config_parameter_changed?.
def is_slurm_database_updated?
  database_key_path = %w(Scheduling SlurmSettings Database)
  config_parameter_changed?(database_key_path)
end

def raise_command_error(command, cmd)
Expand All @@ -71,3 +78,8 @@ def execute_command(command, user = "root", timeout = 300, raise_on_error = true
raise_command_error(command, cmd) if raise_on_error && cmd.error?
cmd.stdout.strip
end

# Verify if MungeKeySecretArn under DevSettings/SlurmSettings of the cluster configuration has been
# updated, by delegating to config_parameter_changed?.
def is_custom_munge_key_updated?
  munge_key_arn_path = %w(DevSettings SlurmSettings MungeKeySecretArn)
  config_parameter_changed?(munge_key_arn_path)
end
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def update_nodes_in_queue(strategy, queues)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
end unless on_docker?

update_munge_head_node

# The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
# slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
# is enabled we must pull the database password from Secrets Manager once again.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,50 +23,67 @@

default_action :setup_munge_key

# Declares a bash resource (run as root from /tmp) that fetches the base64-encoded munge key from
# Secrets Manager using new_resource.munge_key_secret_arn in the cluster region, decodes it, writes
# it to /etc/munge/munge.key, and sets the munge user/group ownership and 0600 permissions.
#
# The script exits with status 1 when the fetched secret is empty or cannot be base64-decoded.
#
# NOTE(review): the decoded key passes through a shell variable before being written to disk;
# confirm that custom keys never contain NUL bytes or rely on trailing newlines.
def fetch_and_decode_munge_key
  declare_resource(:bash, 'fetch_and_decode_munge_key') do
    user 'root'
    group 'root'
    cwd '/tmp'
    code <<-FETCH_AND_DECODE
      set -e
      # Get encoded munge key from secrets manager
      encoded_key=$(aws secretsmanager get-secret-value --secret-id #{new_resource.munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
      # If encoded_key doesn't have a value, error and exit
      if [ -z "$encoded_key" ]; then
        echo "Error fetching munge key from Secrets Manager or the key is empty"
        exit 1
      fi

      # Decode munge key and write to /etc/munge/munge.key
      # The decode is tested directly by the if statement: with set -e a failed plain assignment
      # would abort the script before the error message could be printed.
      if ! decoded_key=$(echo "$encoded_key" | base64 -d); then
        echo "Error decoding the munge key with base64"
        exit 1
      fi

      echo "$decoded_key" > /etc/munge/munge.key

      # Set ownership on the key
      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
      # Enforce correct permission on the key
      chmod 0600 /etc/munge/munge.key
    FETCH_AND_DECODE
  end
end

# Declares a bash resource, run as the munge user and group from /tmp, that generates a random munge
# key with mungekey and then enforces 0600 permissions on /etc/munge/munge.key.
#
# The helper is used by both the setup and the update actions and has no existence guard, so the
# key file may already be present; --force makes mungekey replace it instead of failing.
def generate_munge_key
  declare_resource(:bash, 'generate_munge_key') do
    user node['cluster']['munge']['user']
    group node['cluster']['munge']['group']
    cwd '/tmp'
    code <<-GENERATE_KEY
      set -e
      # --force overwrites an already existing key file
      /usr/sbin/mungekey --verbose --force
      chmod 0600 /etc/munge/munge.key
    GENERATE_KEY
  end
end

# Sets up the munge key: when a secret ARN is configured on the resource the key is fetched from
# Secrets Manager, otherwise a random key is generated.
action :setup_munge_key do
  custom_key_arn = new_resource.munge_key_secret_arn
  custom_key_arn ? fetch_and_decode_munge_key : generate_munge_key
end

# Replaces the previous munge key: when a secret ARN is configured on the resource the key is
# fetched from Secrets Manager, otherwise a new random key is generated.
action :update_munge_key do
  custom_key_arn = new_resource.munge_key_secret_arn
  custom_key_arn ? fetch_and_decode_munge_key : generate_munge_key
end