Skip to content

Commit

Permalink
Merge branch 'develop' into wip/munge-key-rotation
Browse files Browse the repository at this point in the history
  • Loading branch information
hehe7318 authored Sep 18, 2023
2 parents b9c8156 + 4c5b1e9 commit 13a0246
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 41 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ This file is used to list changes made in each version of the AWS ParallelCluste
**BUG FIXES**
- Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources.

3.7.1
------
**CHANGES**
- Upgrade Slurm to 23.02.5 (from 23.02.4).
- Upgrade Pmix to 4.2.6 (from 3.2.3).
- Upgrade libjwt to 1.15.3 (from 1.12.0).

3.7.0
------

Expand Down
20 changes: 20 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ def enable_munge_service
end
end

# Declare a Chef `service` resource that restarts the munge daemon.
# The resource is configured with 5 retries and a retry delay of 10.
def restart_munge_service
  service 'munge' do
    action :restart
    supports restart: true
    retry_delay 10
    retries 5
  end
end

def setup_munge_head_node
# Generate munge key or get its value from Secrets Manager
munge_key_manager 'manage_munge_key' do
Expand All @@ -77,6 +86,17 @@ def setup_munge_head_node
share_munge_head_node
end

# Head-node munge handling for a cluster update.
# Declares the `update_munge_key` munge_key_manager resource (guarded so it only runs when the
# previous cluster config file exists and the custom munge key setting changed), then restarts
# the munge service and shares the key again.
def update_munge_head_node
  munge_key_manager 'update_munge_key' do
    action :update_munge_key
    munge_key_secret_arn lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) }
    only_if do
      # Without a previous config there is nothing to compare against, so the check short-circuits
      previous_config_present = ::File.exist?(node['cluster']['previous_cluster_config_path'])
      previous_config_present && is_custom_munge_key_updated?
    end
  end

  restart_munge_service
  share_munge_head_node
end

def share_munge_head_node
# Share munge key
bash 'share_munge_key' do
Expand Down
16 changes: 14 additions & 2 deletions cookbooks/aws-parallelcluster-slurm/libraries/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,18 @@ def is_compute_node_bootstrap_timeout_updated?(previous_config, config)
evaluate_compute_bootstrap_timeout(previous_config) != evaluate_compute_bootstrap_timeout(config)
end

def is_slurm_database_updated?
# Compares the previous cluster config with the current one for changes in a parameter.
#
# Parameters:
# - `param`: An array representing the sequence of nested keys to the parameter to be checked
#
# Returns true when the value found at that key path differs between the file at
# node['cluster']['cluster_config_path'] and the one at node['cluster']['previous_cluster_config_path'].
def config_parameter_changed?(param)
  require 'yaml'
  # An empty YAML document loads as nil/false; treat it as an empty config so `dig` can be used
  current_config = YAML.safe_load(File.read(node['cluster']['cluster_config_path'])) || {}
  previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path'])) || {}
  # Only the requested key path is inspected; a section missing on one side yields nil for that side
  current_config.dig(*param) != previous_config.dig(*param)
end

# Tell whether the Scheduling > SlurmSettings > Database section differs between
# the previous and the current cluster configuration.
def is_slurm_database_updated?
  database_key_path = ["Scheduling", "SlurmSettings", "Database"]
  config_parameter_changed?(database_key_path)
end

def raise_command_error(command, cmd)
Expand All @@ -71,3 +78,8 @@ def execute_command(command, user = "root", timeout = 300, raise_on_error = true
raise_command_error(command, cmd) if raise_on_error && cmd.error?
cmd.stdout.strip
end

# Verify whether MungeKeySecretArn, looked up under DevSettings > SlurmSettings of the
# cluster configuration, differs between the previous and the current configuration
def is_custom_munge_key_updated?
  munge_key_arn_path = ["DevSettings", "SlurmSettings", "MungeKeySecretArn"]
  config_parameter_changed?(munge_key_arn_path)
end
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ def update_nodes_in_queue(strategy, queues)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end

update_munge_head_node

# The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
# slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
# is enabled we must pull the database password from Secrets Manager once again.
Expand Down
95 changes: 56 additions & 39 deletions cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,50 +23,67 @@

default_action :setup_munge_key

# Declare a bash resource (run as root from /tmp) that reads the base64-encoded munge key from
# the Secrets Manager secret identified by new_resource.munge_key_secret_arn, decodes it into
# /etc/munge/munge.key, and sets the key's owner to the munge user/group with mode 0600.
# The script exits with status 1 when the fetched secret is empty.
# NOTE(review): under `set -e` a failing command-substitution assignment presumably aborts the
# script before the explicit `$?` check is reached - confirm whether that check is still needed.
def fetch_and_decode_munge_key
  declare_resource(:bash, 'fetch_and_decode_munge_key') do
    user 'root'
    group 'root'
    cwd '/tmp'
    code <<-FETCH_AND_DECODE
      set -e
      # Get encoded munge key from secrets manager
      encoded_key=$(aws secretsmanager get-secret-value --secret-id #{new_resource.munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
      # If encoded_key doesn't have a value, error and exit
      if [ -z "$encoded_key" ]; then
        echo "Error fetching munge key from Secrets Manager or the key is empty"
        exit 1
      fi
      # Decode munge key and write to /etc/munge/munge.key
      decoded_key=$(echo $encoded_key | base64 -d)
      if [ $? -ne 0 ]; then
        echo "Error decoding the munge key with base64"
        exit 1
      fi
      echo "$decoded_key" > /etc/munge/munge.key
      # Set ownership on the key
      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
      # Enforce correct permission on the key
      chmod 0600 /etc/munge/munge.key
    FETCH_AND_DECODE
  end
end

# Declare a bash resource (run as the munge user/group from /tmp) that creates a new munge key
# with mungekey and enforces mode 0600 on /etc/munge/munge.key.
# `--force` is passed because this helper is also used by the :update_munge_key action, where a
# key file already exists; without it mungekey refuses to overwrite the existing key.
def generate_munge_key
  declare_resource(:bash, 'generate_munge_key') do
    user node['cluster']['munge']['user']
    group node['cluster']['munge']['group']
    cwd '/tmp'
    code <<-GENERATE_KEY
      set -e
      /usr/sbin/mungekey --verbose --force
      chmod 0600 /etc/munge/munge.key
    GENERATE_KEY
  end
end

# Set up the munge key: fetched from Secrets Manager when a secret ARN is set on the resource,
# randomly generated otherwise.
action :setup_munge_key do
  custom_key_arn = new_resource.munge_key_secret_arn
  # A configured ARN selects the Secrets Manager path; without one a random key is generated
  custom_key_arn ? fetch_and_decode_munge_key : generate_munge_key
end

# Replace the previous munge key: with the one fetched from Secrets Manager when a secret ARN
# is set on the resource, with a randomly generated one otherwise.
action :update_munge_key do
  custom_key_arn = new_resource.munge_key_secret_arn
  # A configured ARN selects the Secrets Manager path; without one a random key is generated
  custom_key_arn ? fetch_and_decode_munge_key : generate_munge_key
end

0 comments on commit 13a0246

Please sign in to comment.