Modify the check_login_nodes_stopped.sh script to directly use the TG name to query (#2524)

Reduce the number of steps needed to check whether the LoginNodes have stopped: querying the Target Group directly by name lets us delete the loop over all NLBs, reducing the script's running time.
hehe7318 authored Nov 2, 2023
1 parent a6b8e1f commit a9c1dfb
Showing 4 changed files with 45 additions and 38 deletions.
11 changes: 11 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
@@ -15,6 +15,9 @@
 #
 # Retrieve compute nodename from file
 #
+
+require 'digest'
+
 def slurm_nodename
   slurm_nodename_file = "#{node['cluster']['slurm_plugin_dir']}/slurm_nodename"
 
@@ -137,3 +140,11 @@ def get_primary_ip
 
   primary_ip
 end
+
+def get_target_group_name(cluster_name, pool_name)
+  partial_cluster_name = cluster_name[0..6]
+  partial_pool_name = pool_name[0..6]
+  combined_name = cluster_name + pool_name
+  hash_value = Digest::SHA256.hexdigest(combined_name)[0..15]
+  "#{partial_cluster_name}-#{partial_pool_name}-#{hash_value}"
+end
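
The naming scheme takes the first seven characters of each name plus the first sixteen hex digits of a SHA256 over the concatenated full names, so the result is at most 7 + 1 + 7 + 1 + 16 = 32 characters, which fits within ELBv2's 32-character limit on target group names. A standalone sanity check of the scheme, reusing the input/output pair from the new spec below:

require 'digest'

cluster_name = 'test-cluster'
pool_name = 'test-pool'

# First 7 chars of each name, '-' separators, then 16 hex chars of the
# SHA256 digest of the concatenated full names: at most 32 chars total.
name = "#{cluster_name[0..6]}-#{pool_name[0..6]}-" \
       "#{Digest::SHA256.hexdigest(cluster_name + pool_name)[0..15]}"
puts name # => test-cl-test-po-18c74b16dfbc78ac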
@@ -11,14 +11,19 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
 # rubocop:disable Style/SingleArgumentDig
 
 template "#{node['cluster']['scripts_dir']}/slurm/check_login_nodes_stopped.sh" do
   source 'slurm/head_node/check_login_nodes_stopped.sh.erb'
   owner 'root'
   group 'root'
   mode '0700'
   variables(
     cluster_name: node['cluster']['cluster_name'] || node['cluster']['stack_name'],
-    login_nodes_pool_name: lazy { node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name) },
+    target_group_name: lazy do
+      get_target_group_name(
+        node['cluster']['cluster_name'] || node['cluster']['stack_name'],
+        node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name)
+      )
+    end,
     region: node['cluster']['region']
   )
   only_if do
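
In the recipe above, target_group_name is wrapped in a lazy block while cluster_name is read eagerly: the cluster config under node['cluster']['config'] is presumably not populated until converge time, so the dig for the pool name has to be deferred past Chef's compile phase. A minimal illustration of the difference (the resource path and template name here are hypothetical, not from the cookbook):

# Compile phase: this runs while resources are still being declared, so an
# eagerly read attribute may not have been populated yet.
eager_name = node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name) # may be nil

template '/tmp/example.sh' do # hypothetical resource
  source 'example.sh.erb'
  # Converge phase: the lazy block is evaluated only when the resource
  # actually executes, after earlier resources have populated the config.
  variables(pool_name: lazy { node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name) })
end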
@@ -0,0 +1,22 @@
+require_relative '../../../libraries/helpers'
+
+describe 'get_target_group_name' do
+  shared_examples 'a valid target group name generator' do |cluster_name, pool_name, expected_result|
+    it 'generates a correctly formatted target group name' do
+      target_group_name = get_target_group_name(cluster_name, pool_name)
+      expect(target_group_name).to eq(expected_result)
+    end
+  end
+
+  context 'when cluster and pool names are regular strings' do
+    include_examples 'a valid target group name generator', 'test-cluster', 'test-pool', 'test-cl-test-po-18c74b16dfbc78ac'
+  end
+
+  context 'when cluster and pool names are longer strings' do
+    include_examples 'a valid target group name generator', 'abcdefghijklmnopqrstuvwxyz', 'zyxwvutsrqponmlkjihgfedcba', 'abcdefg-zyxwvut-20f1fcdf919164c7'
+  end
+
+  context 'when cluster and pool names are single characters' do
+    include_examples 'a valid target group name generator', 'a', 'b', 'a-b-fb8e20fc2e4c3f24'
+  end
+end
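
Because both name components are truncated to seven characters, distinct cluster/pool pairs can share the same prefix; only the hash suffix, computed over the full untruncated names, keeps the generated names unique. A small demonstration (tg_name is a local copy of the helper, for illustration only):

require 'digest'

# Local copy of get_target_group_name, for illustration only.
def tg_name(cluster, pool)
  "#{cluster[0..6]}-#{pool[0..6]}-#{Digest::SHA256.hexdigest(cluster + pool)[0..15]}"
end

# Both pools truncate to the same 'login-p' prefix, but the SHA256 suffix
# differs because it is computed over the full names:
puts tg_name('my-cluster', 'login-pool-one')
puts tg_name('my-cluster', 'login-pool-two')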
@@ -1,50 +1,19 @@
 #!/bin/bash
 # This script checks whether there are running login nodes in a specified AWS ParallelCluster stack and login nodes pool.
-# It first retrieves the ARN of the Load Balancer associated with the specified stack and login nodes pool.
-# If a Load Balancer is found, it then retrieves the ARN of the Target Group associated with the Load Balancer.
-# Lastly, it checks the health of the targets in the Target Group to determine the number of healthy and unhealthy login nodes.
+# It first retrieves the ARN of the Target Group associated with the LoginNodes Network Load Balancer.
+# Then, it checks the health of the targets in the Target Group to determine the number of healthy and unhealthy login nodes.
 # If there are any healthy or unhealthy nodes found, it concludes that there are running login nodes.
 #
 # Usage: ./check_if_has_running_login_nodes.sh
 
 set -e
 
-CLUSTER_NAME="<%= @cluster_name %>"
-LOGIN_NODES_POOL_NAME="<%= @login_nodes_pool_name %>"
+TARGET_GROUP_NAME="<%= @target_group_name %>"
 REGION="<%= @region %>"
 
-# List all Load Balancers
-load_balancers=$(aws elbv2 describe-load-balancers --region ${REGION})
-
-# Iterate over Load Balancers to find the one with matching tags
-load_balancer_arn=''
-for arn in $(echo "${load_balancers}" | jq -r '.LoadBalancers[].LoadBalancerArn'); do
-  # Get tags for the current Load Balancer
-  tags=$(aws elbv2 describe-tags --resource-arns "${arn}" --region ${REGION})
-
-  # Check if the tags match the desired stack name and login nodes pool name
-  cluster_name_match=$(echo "${tags}" | jq -r --arg key "parallelcluster:cluster-name" --arg value "${CLUSTER_NAME}" '.TagDescriptions[] | select(.Tags[]? | (.Key == $key and .Value == $value))')
-  login_nodes_pool_name_match=$(echo "${tags}" | jq -r --arg key "parallelcluster:login-nodes-pool" --arg value "${LOGIN_NODES_POOL_NAME}" '.TagDescriptions[] | select(.Tags[]? | (.Key == $key and .Value == $value))')
-
-  # If both tags are found, store the ARN and break the loop
-  # For now, there's only one pool of login nodes per cluster.
-  if [[ -n "${cluster_name_match}" && -n "${login_nodes_pool_name_match}" ]]; then
-    load_balancer_arn="${arn}"
-    break
-  fi
-done
-
-# Output result
-if [[ -n "${load_balancer_arn}" ]]; then
-  echo "Load Balancer ARN found: ${load_balancer_arn}"
-else
-  echo "No Load Balancer found for the cluster ${CLUSTER_NAME} and login nodes pool ${LOGIN_NODES_POOL_NAME}."
-  exit 1
-fi
-
-# Get Target Group ARN associated with the Load Balancer
+# Get Target Group ARN
 target_group_arn=$(aws elbv2 describe-target-groups \
-  --load-balancer-arn $load_balancer_arn \
+  --names ${TARGET_GROUP_NAME} \
   --query "TargetGroups[0].TargetGroupArn" \
   --output text \
   --region ${REGION})
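
The rest of the script, which checks the health of the targets behind the resolved ARN to count healthy and unhealthy login nodes, is not shown in this diff. For a sense of the end-to-end flow, here is a rough Ruby equivalent of the name-based lookup plus health check using the aws-sdk-elasticloadbalancingv2 gem; the region and target group name are placeholder values, and this sketch is illustrative rather than part of the commit:

require 'aws-sdk-elasticloadbalancingv2'

elbv2 = Aws::ElasticLoadBalancingV2::Client.new(region: 'us-east-1') # placeholder region

# Look the target group up directly by name, as the revised script now does
# (raises an error if no target group with that name exists).
tg_arn = elbv2.describe_target_groups(names: ['test-cl-test-po-18c74b16dfbc78ac'])
              .target_groups.first.target_group_arn

# Any registered target, healthy or unhealthy, means login nodes are running.
states = elbv2.describe_target_health(target_group_arn: tg_arn)
              .target_health_descriptions
              .map { |d| d.target_health.state }

puts states.empty? ? 'Login nodes stopped' : "Login nodes running: #{states.tally}"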
