From 2210cd8ee97e2459ba0589f41d1ccdb6f3da88cd Mon Sep 17 00:00:00 2001
From: Xuanqi He <93849823+hehe7318@users.noreply.github.com>
Date: Fri, 15 Sep 2023 06:11:05 -0400
Subject: [PATCH 1/2] [develop] Add custom munge key update logic (#2452)

* Munge key update

* Move action of updating munge key to munge key manager and delete the update manager
---
Review amendments (hunk sizes and diffstat adjusted accordingly; the blob ids on
the index lines of update.rb and munge_key_manager.rb predate them):
 * generate_munge_key keeps the not_if guard that the previous inline resource had,
   so :setup_munge_key stays idempotent.
 * :update_munge_key removes the previous key before generating a new one, because
   mungekey does not overwrite an existing key file unless --force is given.
 * the base64 decode failure is tested directly, so its error message is reachable
   under "set -e"; the bytes written to the key file are unchanged.
 * the comment on is_custom_munge_key_updated? names the DevSettings path it reads.

 .../libraries/helpers.rb                      |  20 ++++
 .../libraries/update.rb                       |  16 ++-
 .../recipes/update/update_head_node.rb        |   2 +
 .../resources/munge_key_manager.rb            | 101 +++++++++++-------
 4 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
index 24a2c01b4..a3cd308e9 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
@@ -65,6 +65,15 @@ def enable_munge_service
   end
 end
 
+def restart_munge_service
+  service "munge" do
+    supports restart: true
+    action :restart
+    retries 5
+    retry_delay 10
+  end
+end
+
 def setup_munge_head_node
   # Generate munge key or get it's value from secrets manager
   munge_key_manager 'manage_munge_key' do
@@ -77,6 +86,17 @@ def setup_munge_head_node
   share_munge_head_node
 end
 
+def update_munge_head_node
+  munge_key_manager 'update_munge_key' do
+    munge_key_secret_arn lazy { node['cluster']['config'].dig(:DevSettings, :SlurmSettings, :MungeKeySecretArn) }
+    action :update_munge_key
+    only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
+  end
+
+  restart_munge_service
+  share_munge_head_node
+end
+
 def share_munge_head_node
   # Share munge key
   bash 'share_munge_key' do
diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
index de9fbac08..81204312e 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/update.rb
@@ -53,11 +53,18 @@ def is_compute_node_bootstrap_timeout_updated?(previous_config, config)
   evaluate_compute_bootstrap_timeout(previous_config) != evaluate_compute_bootstrap_timeout(config)
 end
 
-def is_slurm_database_updated?
+def config_parameter_changed?(param)
+  # Compares previous cluster config with the current one for changes in a parameter
+  # Parameters:
+  # - `param`: An array representing the sequence of nested keys to the parameter to be checked
   require 'yaml'
   config = YAML.safe_load(File.read(node['cluster']['cluster_config_path']))
   previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path']))
-  config["Scheduling"]["SlurmSettings"]["Database"] != previous_config["Scheduling"]["SlurmSettings"]["Database"]
+  config.dig(*param) != previous_config.dig(*param)
+end
+
+def is_slurm_database_updated?
+  config_parameter_changed?(%w(Scheduling SlurmSettings Database))
 end
 
 def raise_command_error(command, cmd)
@@ -71,3 +78,8 @@ def execute_command(command, user = "root", timeout = 300, raise_on_error = true
   raise_command_error(command, cmd) if raise_on_error && cmd.error?
   cmd.stdout.strip
 end
+
+# Verify if MungeKeySecretArn in DevSettings/SlurmSettings section of cluster configuration has been updated
+def is_custom_munge_key_updated?
+  config_parameter_changed?(%w(DevSettings SlurmSettings MungeKeySecretArn))
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 9f5ba1e7c..812c9c103 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -201,6 +201,8 @@ def update_nodes_in_queue(strategy, queues)
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
 end unless on_docker?
 
+update_munge_head_node
+
 # The previous execute "generate_pcluster_slurm_configs" block resource may have overridden the slurmdbd password in
 # slurm_parallelcluster_slurmdbd.conf with a default value, so if it has run and Slurm accounting
 # is enabled we must pull the database password from Secrets Manager once again.
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
index 2a696ba18..64bdd977d 100644
--- a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
+++ b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
@@ -23,50 +23,73 @@
 
 default_action :setup_munge_key
 
-action :setup_munge_key do
-  if new_resource.munge_key_secret_arn
-    # This block will fetch the munge key from Secrets Manager
-    bash 'fetch_and_decode_munge_key' do
-      user 'root'
-      group 'root'
-      cwd '/tmp'
-      code <<-FETCH_AND_DECODE
-        set -e
-        # Get encoded munge key from secrets manager
-        encoded_key=$(aws secretsmanager get-secret-value --secret-id #{new_resource.munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
-        # If encoded_key doesn't have a value, error and exit
-        if [ -z "$encoded_key" ]; then
-          echo "Error fetching munge key from Secrets Manager or the key is empty"
-          exit 1
-        fi
+def fetch_and_decode_munge_key
+  declare_resource(:bash, 'fetch_and_decode_munge_key') do
+    user 'root'
+    group 'root'
+    cwd '/tmp'
+    code <<-FETCH_AND_DECODE
+      set -e
+      # Get encoded munge key from secrets manager
+      encoded_key=$(aws secretsmanager get-secret-value --secret-id #{new_resource.munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
+      # If encoded_key doesn't have a value, error and exit
+      if [ -z "$encoded_key" ]; then
+        echo "Error fetching munge key from Secrets Manager or the key is empty"
+        exit 1
+      fi
 
-        # Decode munge key and write to /etc/munge/munge.key
-        decoded_key=$(echo $encoded_key | base64 -d)
-        if [ $? -ne 0 ]; then
-          echo "Error decoding the munge key with base64"
-          exit 1
-        fi
+      # Decode munge key and write to /etc/munge/munge.key
+      # (the assignment is tested directly: with "set -e" a separate exit-status check is never reached)
+      if ! decoded_key=$(echo $encoded_key | base64 -d); then
+        echo "Error decoding the munge key with base64"
+        exit 1
+      fi
 
-        echo "$decoded_key" > /etc/munge/munge.key
+      echo "$decoded_key" > /etc/munge/munge.key
 
-        # Set ownership on the key
-        chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
-        # Enforce correct permission on the key
-        chmod 0600 /etc/munge/munge.key
-      FETCH_AND_DECODE
-    end
-  else
-    # This block will generate a munge key if it doesn't exist
-    bash 'generate_munge_key' do
-      not_if { ::File.exist?('/etc/munge/munge.key') }
-      user node['cluster']['munge']['user']
-      group node['cluster']['munge']['group']
-      cwd '/tmp'
-      code <<-GENERATE_KEY
+      # Set ownership on the key
+      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
+      # Enforce correct permission on the key
+      chmod 0600 /etc/munge/munge.key
+    FETCH_AND_DECODE
+  end
+end
+
+def generate_munge_key
+  declare_resource(:bash, 'generate_munge_key') do
+    # Skip generation when a key is already in place; callers that need a new key remove the old one first
+    not_if { ::File.exist?('/etc/munge/munge.key') }
+    user node['cluster']['munge']['user']
+    group node['cluster']['munge']['group']
+    cwd '/tmp'
+    code <<-GENERATE_KEY
       set -e
       /usr/sbin/mungekey --verbose
       chmod 0600 /etc/munge/munge.key
-      GENERATE_KEY
-    end
+    GENERATE_KEY
+  end
+end
+
+action :setup_munge_key do
+  if new_resource.munge_key_secret_arn
+    # This block will fetch the munge key from Secrets Manager
+    fetch_and_decode_munge_key
+  else
+    # This block will generate a munge key if it doesn't exist
+    generate_munge_key
+  end
+end
+
+action :update_munge_key do
+  if new_resource.munge_key_secret_arn
+    # This block will fetch the munge key from Secrets Manager and replace the previous munge key
+    fetch_and_decode_munge_key
+  else
+    # mungekey does not overwrite an existing key file unless forced, so remove the previous key first,
+    # then randomly generate its replacement
+    file '/etc/munge/munge.key' do
+      action :delete
+    end
+    generate_munge_key
   end
 end

From 4c5b1e943dd98f1841d128b5ab2440a78d9abe01 Mon Sep 17 00:00:00 2001
From: Hanwen
Date: Fri, 15 Sep 2023 07:01:10 -0700
Subject: [PATCH 2/2] Update CHANGELOG.md

Signed-off-by: Hanwen
---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bc4845d5f..63938c8ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,13 @@ This file is used to list changes made in each version of the AWS ParallelCluste
 **BUG FIXES**
 - Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources.
 
+3.7.1
+------
+**CHANGES**
+- Upgrade Slurm to 23.02.5 (from 23.02.4).
+  - Upgrade Pmix to 4.2.6 (from 3.2.3).
+  - Upgrade libjwt to 1.15.3 (from 1.12.0).
+
 3.7.0
 ------
 