Skip to content

Commit

Permalink
Merge pull request #1245 from buildkite/pdp-1828-take-over-terminate-…
Browse files Browse the repository at this point in the history
…instance-pr
  • Loading branch information
triarius authored Oct 23, 2023
2 parents 35e31db + 198db6e commit d80bf0d
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 14 deletions.
15 changes: 15 additions & 0 deletions packer/linux/conf/bin/bk-install-elastic-stack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ echo Writing Phase 2/2 for /var/lib/buildkite-agent/cfn-env helper function...
cat <<EOF >>/var/lib/buildkite-agent/cfn-env
set_always "BUILDKITE_AGENTS_PER_INSTANCE" "$BUILDKITE_AGENTS_PER_INSTANCE"
# also set via /etc/systemd/system/buildkite-agent.service.d/environment.conf
set_always "BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB" "$BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB"
set_always "BUILDKITE_ECR_POLICY" "${BUILDKITE_ECR_POLICY:-none}"
set_always "BUILDKITE_SECRETS_BUCKET" "$BUILDKITE_SECRETS_BUCKET"
set_always "BUILDKITE_SECRETS_BUCKET_REGION" "$BUILDKITE_SECRETS_BUCKET_REGION"
Expand Down Expand Up @@ -357,6 +361,17 @@ done
echo "Waited $next_wait_time times for docker to start. We will exit if it still has not started."
check_docker

echo Writing buildkite-agent systemd environment override...
# also set in /var/lib/buildkite-agent/cfn-env so that it's shown in the job logs
mkdir -p /etc/systemd/system/buildkite-agent.service.d
cat <<EOF | tee /etc/systemd/system/buildkite-agent.service.d/environment.conf
[Service]
Environment="BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB}"
EOF

echo Reloading systemctl services...
systemctl daemon-reload

echo Starting buildkite-agent...
systemctl enable --now buildkite-agent

Expand Down
49 changes: 43 additions & 6 deletions packer/linux/conf/buildkite-agent/scripts/terminate-instance
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,51 @@

set -euo pipefail

echo "sleeping for 10 seconds before terminating instance to allow agent logs to drain to cloudwatch..."
terminate() {
aws autoscaling terminate-instance-in-auto-scaling-group \
--region "$1" \
--instance-id "$2" \
--should-decrement-desired-capacity
}

mark_as_unhealthy() {
aws autoscaling set-instance-health \
--region "$1" \
--instance-id "$2" \
--health-status Unhealthy
}

echo "sleeping for 10 seconds before terminating instance to allow agent logs to drain to cloudwatch..."
sleep 10

token=$(curl -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 60" --fail --silent --show-error --location "http://169.254.169.254/latest/api/token")
instance_id=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/instance-id")
region=$(curl -H "X-aws-ec2-metadata-token: $token" --fail --silent --show-error --location "http://169.254.169.254/latest/meta-data/placement/region")
token=$(
curl \
--fail --silent --show-error \
-X PUT \
-H "X-aws-ec2-metadata-token-ttl-seconds: 60" \
--location "http://169.254.169.254/latest/api/token"
)
instance_id=$(
curl \
--fail --silent --show-error \
-H "X-aws-ec2-metadata-token: $token" \
--location "http://169.254.169.254/latest/meta-data/instance-id"
)
region=$(
curl \
--fail --silent --show-error \
-H "X-aws-ec2-metadata-token: $token" \
--location "http://169.254.169.254/latest/meta-data/placement/region"
)

echo "requesting instance termination..."

aws autoscaling terminate-instance-in-auto-scaling-group --region "$region" --instance-id "$instance_id" "--should-decrement-desired-capacity"
if [[ $BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB == "true" ]]; then
# If the ASG is at the min capacity, the call to terminate-instance-in-autoscaling-group
# In this case, we mark the instance as unhealthy, then the ASG will spin up a new instance
# to replace it.
terminate "$region" "$instance_id" || mark_as_unhealthy "$region" "$instance_id"
else
# If we're not in terminate-after-job mode, then it's fine for this to fail. Systemd will restart
# the agent and it'll be as if the instance never got shut down.
terminate "$region" "$instance_id"
fi
19 changes: 16 additions & 3 deletions packer/windows/conf/bin/bk-install-elastic-stack.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ function set_always() {
Add-Content -Path C:\buildkite-agent\cfn-env -Value @"
set_always "BUILDKITE_AGENTS_PER_INSTANCE" "$Env:BUILDKITE_AGENTS_PER_INSTANCE"
# also set via nssm
set_always "BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB" "$Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB"
set_always "BUILDKITE_ECR_POLICY" "$Env:BUILDKITE_ECR_POLICY"
set_always "BUILDKITE_SECRETS_BUCKET" "$Env:BUILDKITE_SECRETS_BUCKET"
set_always "BUILDKITE_SECRETS_BUCKET_REGION" "$Env:BUILDKITE_SECRETS_BUCKET_REGION"
Expand Down Expand Up @@ -146,7 +150,7 @@ tracing-backend=${Env:BUILDKITE_AGENT_TRACING_BACKEND}
"@
$OFS=" "

nssm set lifecycled AppEnvironmentExtra :AWS_REGION=$Env:AWS_REGION
nssm set lifecycled AppEnvironmentExtra +AWS_REGION=$Env:AWS_REGION
nssm set lifecycled AppEnvironmentExtra +LIFECYCLED_HANDLER="C:\buildkite-agent\bin\stop-agent-gracefully.ps1"
Restart-Service lifecycled

Expand Down Expand Up @@ -212,26 +216,35 @@ Write-Output "Starting the Buildkite Agent"

nssm install buildkite-agent C:\buildkite-agent\bin\buildkite-agent.exe start
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent ObjectName .\$UserName $Password
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent AppStdout C:\buildkite-agent\buildkite-agent.log
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent AppStderr C:\buildkite-agent\buildkite-agent.log
If ($lastexitcode -ne 0) { Exit $lastexitcode }
nssm set buildkite-agent AppEnvironmentExtra :HOME=C:\buildkite-agent

nssm set buildkite-agent AppEnvironmentExtra +HOME=C:\buildkite-agent

If ((![string]::IsNullOrEmpty($Env:BUILDKITE_ENV_FILE_URL)) -And (Test-Path -Path C:\buildkite-agent\env -PathType leaf)) {
foreach ($var in Get-Content C:\buildkite-agent\env) {
nssm set buildkite-agent AppEnvironmentExtra $var
nssm set buildkite-agent AppEnvironmentExtra "+$var"
If ($lastexitcode -ne 0) { Exit $lastexitcode }
}
}

# also set in cfn so it's show in job logs
nssm set buildkite-agent AppEnvironmentExtra +BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=$Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent AppExit Default Restart
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent AppRestartDelay 10000
If ($lastexitcode -ne 0) { Exit $lastexitcode }

nssm set buildkite-agent AppEvents Exit/Post "powershell C:\buildkite-agent\bin\terminate-instance.ps1"
If ($lastexitcode -ne 0) { Exit $lastexitcode }

Expand Down
25 changes: 20 additions & 5 deletions packer/windows/conf/buildkite-agent/scripts/terminate-instance.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,28 @@ $Token = (Invoke-WebRequest -UseBasicParsing -Method Put -Headers @{'X-aws-ec2-m
$InstanceId = (Invoke-WebRequest -UseBasicParsing -Headers @{'X-aws-ec2-metadata-token' = $Token} http://169.254.169.254/latest/meta-data/instance-id).content
$Region = (Invoke-WebRequest -UseBasicParsing -Headers @{'X-aws-ec2-metadata-token' = $Token} http://169.254.169.254/latest/meta-data/placement/region).content

Write-Output "terminate-instance: disconnecting agent..."
nssm stop buildkite-agent

Write-Output "terminate-instance: requesting instance termination..."
aws autoscaling terminate-instance-in-auto-scaling-group --region "$Region" --instance-id "$InstanceId" "--should-decrement-desired-capacity" 2> $null

if ($lastexitcode -eq 0) { # If autoscaling request was successful, we will terminate
Write-Output "terminate-instance: disabling buildkite-agent service"
nssm stop buildkite-agent
}
else {
# If autoscaling request was successful, we will terminate the instance, otherwise, if
# BuildkiteTerminateInstanceAfterJob is set to true, we will mark the instance as unhealthy
# so that the ASG will terminate it despite scale-in protection. Otherwise, we should not
# terminate the instance, so we need to retart the agent.
if ($lastexitcode -eq 0) {
Write-Output "terminate-instance: terminating instance..."
} else {
Write-Output "terminate-instance: ASG could not decrement (we're already at minSize)"
if ($Env:BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB -eq "true") {
Write-Output "terminate-instance: marking instance as unhealthy"
aws autoscaling set-instance-health `
--instance-id "$InstanceId" `
--region "$Region" `
--health-status Unhealthy
} else {
Write-Output "terminate-instance: restarting agent..."
nssm start buildkite-agent
}
}

0 comments on commit d80bf0d

Please sign in to comment.