Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[develop] Reuse the rotation script inside the munge resource #2471

Merged
merged 17 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 0 additions & 35 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,25 +65,13 @@ def enable_munge_service
end
end

def restart_munge_service
service "munge" do
supports restart: true
action :restart
retries 5
retry_delay 10
end
end

def setup_munge_head_node
# Generate munge key or get it's value from secrets manager
munge_key_manager 'manage_munge_key' do
munge_key_secret_arn lazy {
node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn)
}
end

enable_munge_service
share_munge_head_node
end

def update_munge_head_node
Expand All @@ -92,29 +80,6 @@ def update_munge_head_node
action :update_munge_key
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end

restart_munge_service
share_munge_head_node
end

def share_munge_key_to_dir(shared_dir)
bash 'share_munge_key' do
user 'root'
group 'root'
code <<-SHARE_MUNGE_KEY
set -e
mkdir -p #{shared_dir}/.munge
# Copy key to shared dir
cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
chmod 0700 #{shared_dir}/.munge
chmod 0600 #{shared_dir}/.munge/.munge.key
SHARE_MUNGE_KEY
end
end

def share_munge_head_node
share_munge_key_to_dir(node['cluster']['shared_dir'])
share_munge_key_to_dir(node['cluster']['shared_dir_login'])
end

def setup_munge_key(shared_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,26 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Copy pcluster config generator and templates
include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
source 'slurm/head_node/update_munge_key.sh.erb'
owner 'root'
group 'root'
mode '0700'
variables(
munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
region: node['cluster']['region'],
munge_user: node['cluster']['munge']['user'],
munge_group: node['cluster']['munge']['group'],
shared_directory_compute: node['cluster']['shared_dir'],
shared_directory_login: node['cluster']['shared_dir_login_nodes']
)
end

include_recipe 'aws-parallelcluster-slurm::config_munge_key'

# Export /opt/slurm
Expand All @@ -25,8 +45,6 @@
only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' }
end unless on_docker?

include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
source 'slurm/slurm.conf.erb'
owner 'root'
Expand Down Expand Up @@ -218,20 +236,3 @@
retries 5
retry_delay 2
end unless redhat_on_docker?

include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
source 'slurm/head_node/update_munge_key.sh.erb'
owner 'root'
group 'root'
mode '0700'
variables(
munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
region: node['cluster']['region'],
munge_user: node['cluster']['munge']['user'],
munge_group: node['cluster']['munge']['group'],
shared_directory_compute: node['cluster']['shared_dir'],
shared_directory_login: node['cluster']['shared_dir_login']
)
end
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def update_nodes_in_queue(strategy, queues)
munge_user: node['cluster']['munge']['user'],
munge_group: node['cluster']['munge']['group'],
shared_directory_compute: node['cluster']['shared_dir'],
shared_directory_login: node['cluster']['shared_dir_login']
shared_directory_login: node['cluster']['shared_dir_login_nodes']
)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,41 @@

default_action :setup_munge_key

def fetch_and_decode_munge_key(munge_key_secret_arn)
declare_resource(:bash, 'fetch_and_decode_munge_key') do
def restart_munge_service
declare_resource(:service, "munge") do
supports restart: true
action :restart
retries 5
retry_delay 10
end
end

def share_munge_key_to_dir(shared_dir)
declare_resource(:bash, 'share_munge_key') do
user 'root'
group 'root'
cwd '/tmp'
code <<-FETCH_AND_DECODE
code <<-SHARE_MUNGE_KEY
set -e
# Get encoded munge key from secrets manager
encoded_key=$(aws secretsmanager get-secret-value --secret-id #{munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
# If encoded_key doesn't have a value, error and exit
if [ -z "$encoded_key" ]; then
echo "Error fetching munge key from Secrets Manager or the key is empty"
exit 1
fi

# Decode munge key and write to /etc/munge/munge.key
decoded_key=$(echo $encoded_key | base64 -d)
if [ $? -ne 0 ]; then
echo "Error decoding the munge key with base64"
exit 1
fi
mkdir -p #{shared_dir}/.munge
# Copy key to shared dir
cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
chmod 0700 #{shared_dir}/.munge
chmod 0600 #{shared_dir}/.munge/.munge.key
SHARE_MUNGE_KEY
end
end

echo "$decoded_key" > /etc/munge/munge.key
def share_munge
share_munge_key_to_dir(node['cluster']['shared_dir'])
share_munge_key_to_dir(node['cluster']['shared_dir_login_nodes'])
end

# Set ownership on the key
chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
# Enforce correct permission on the key
chmod 0600 /etc/munge/munge.key
FETCH_AND_DECODE
# TODO: Consider renaming 'generate_munge_key' and 'fetch_and_decode_munge_key' to more descriptive names that better convey their functionalities.
def fetch_and_decode_munge_key
declare_resource(:execute, 'fetch_and_decode_munge_key') do
jdeamicis marked this conversation as resolved.
Show resolved Hide resolved
user 'root'
group 'root'
command "/#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh -d"
end
end
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe you can simplify this block:

def fetch_and_decode_munge_key
  declare_resource(:execute, 'fetch_and_decode_munge_key') do
    user 'root'
    group 'root'
    command "/#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh -d"
  end
end

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great comment! Thank you Jacopo.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we actually need the first / character in the command variable. Though, in principle scripts_dir will always be an absolute path, so it shouldn't hurt.


jdeamicis marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -66,12 +72,21 @@ def generate_munge_key
chmod 0600 /etc/munge/munge.key
GENERATE_KEY
end

# This function randomly generates a new munge key.
# After generating the key, it is essential to restart the munge service so that it starts using the new key.
# Moreover, the new key has to be shared across relevant directories to ensure consistent authentication across the cluster.
# We're restarting the munge service and sharing the munge key here within the `generate_munge_key` method,
# and not within the `fetch_and_decode_munge_key` method because the `update_munge_key.sh` script,
# which is called by `fetch_and_decode_munge_key`, already includes these two operations.
restart_munge_service
share_munge
end

action :setup_munge_key do
if new_resource.munge_key_secret_arn
# This block will fetch the munge key from Secrets Manager
fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
fetch_and_decode_munge_key
else
# This block will randomly generate a munge key
generate_munge_key
Expand All @@ -81,7 +96,7 @@ def generate_munge_key
action :update_munge_key do
if new_resource.munge_key_secret_arn
# This block will fetch the munge key from Secrets Manager and replace the previous munge key
fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
fetch_and_decode_munge_key
else
# This block will randomly generate a munge key and replace the previous munge key
generate_munge_key
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,37 @@ MUNGE_USER="<%= @munge_user %>"
MUNGE_GROUP="<%= @munge_group %>"
SHARED_DIRECTORY_COMPUTE="<%= @shared_directory_compute %>"
SHARED_DIRECTORY_LOGIN="<%= @shared_directory_login %>"
CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"

# Check if the script exists
if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
# Check if login nodes are running
if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
exit 1
fi
fi

# Check compute fleet status
compute_fleet_status=$(get-compute-fleet-status.sh)
if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
echo "Compute fleet is not stopped. Please stop it before updating the munge key."
exit 1
DISABLE_RESOURCE_CHECK=false

while getopts "d" opt; do
case $opt in
d) DISABLE_RESOURCE_CHECK=true;;
*)
echo "Usage: $0 [-d]" >&2
exit 1
;;
esac
done

if ! $DISABLE_RESOURCE_CHECK; then
jdeamicis marked this conversation as resolved.
Show resolved Hide resolved
# Check compute fleet status
compute_fleet_status=$(get-compute-fleet-status.sh)
if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
echo "Compute fleet is not stopped. Please stop it before updating the munge key."
exit 1
fi
jdeamicis marked this conversation as resolved.
Show resolved Hide resolved

# Check LoginNodes status
CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"

# Check if the script exists
if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
# Check if login nodes are running
if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
exit 1
fi
fi
fi

# If SECRET_ARN is provided, fetch the munge key from Secrets Manager
Expand Down