Skip to content

Commit

Permalink
ci: delete dangling vms in azure e2e test
Browse files Browse the repository at this point in the history
Tags are being used to delete VMs that haven't been properly deleted in
the Azure e2e test. The deprovisioner will only be called if the tests
have failed. The cleanup has been moved to a separate step.

Signed-off-by: Magnus Kulke <[email protected]>
  • Loading branch information
mkulke committed Jul 4, 2024
1 parent d84e476 commit 1308eda
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 51 deletions.
84 changes: 44 additions & 40 deletions .github/workflows/azure-e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ env:
BUILTIN_CLOUD_PROVIDERS: "azure"
TEST_E2E_CREATE_RG: "no"
ACR_URL: "${{ vars.AZURE_ACR_URL }}"
TEST_TAGS: "owner=github-actions,run=${{ github.run_id }}-${{ github.run_attempt }}"

on:
schedule:
Expand Down Expand Up @@ -110,14 +111,15 @@ jobs:
needs:
- generate-podvm-image-version
- build-caa-container-image
# when none of required steps failed, skipped is ok
if: always() && !failure() && !cancelled()
strategy:
matrix:
parameters:
- id: "tdx"
machine_type: "Standard_DC2es_v5"
- id: "snp"
machine_type: "Standard_DC2as_v5"
if: always() && !cancelled() && needs.build-caa-container-image.result != 'failure'
steps:
- uses: actions/checkout@v3

Expand Down Expand Up @@ -162,6 +164,7 @@ jobs:
KBS_IMAGE="${KBS_IMAGE}"
KBS_IMAGE_TAG="${KBS_IMAGE_TAG}"
AZURE_INSTANCE_SIZE="${AZURE_INSTANCE_SIZE}"
TAGS="${{ env.TEST_TAGS }}"
EOF
cat "$TEST_PROVISION_FILE"
# assert that no variable is unset
Expand Down Expand Up @@ -199,18 +202,17 @@ jobs:
run:
working-directory: src/cloud-api-adaptor
needs:
- build-podvm-image
- build-caa-container-image
- install-aks
- generate-podvm-image-version
- build-podvm-image
# when none of required steps failed, build-podvm-image can be skipped
if: always() && !failure() && !cancelled()
strategy:
matrix:
parameters:
- id: "tdx"
machine_type: "Standard_DC2es_v5"
- id: "snp"
machine_type: "Standard_DC2as_v5"
if: always() && !cancelled() && needs.build-podvm-image.result != 'failure'
steps:
- uses: actions/checkout@v3

Expand Down Expand Up @@ -269,59 +271,37 @@ jobs:
--name "${CLUSTER_NAME}"
make test-e2e
cleanup-resources:
cleanup:
runs-on: ubuntu-latest
defaults:
run:
working-directory: src/cloud-api-adaptor
needs:
- generate-podvm-image-version
- build-podvm-image
- build-caa-container-image
- run-e2e-test
- generate-podvm-image-version
if: always()
strategy:
matrix:
parameters:
- id: "tdx"
machine_type: "Standard_DC2es_v5"
- id: "snp"
machine_type: "Standard_DC2as_v5"
if: always()
steps:
- uses: actions/checkout@v3

- name: Extract go version number
run: echo "GO_VERSION=$(yq -e '.tools.golang' versions.yaml)" >> "$GITHUB_ENV"

- name: Set up Go environment
uses: actions/setup-go@v4
with:
go-version: ${{ env.GO_VERSION }}

- name: Set Provisioner Environment Variables
run: |
echo "TEST_PROVISION_FILE=${{ format(env.TEST_PROVISION_PATH_TEMPLATE, matrix.parameters.id) }}" >> "$GITHUB_ENV"
echo "CLUSTER_NAME=${{ format(env.CLUSTER_NAME_TEMPLATE, matrix.parameters.id) }}" >> "$GITHUB_ENV"
- name: Restore the configuration created before
uses: actions/download-artifact@v3
with:
name: e2e-configuration

- uses: azure/login@v1
name: 'Az CLI login'
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}

# Clean up step, run regardless of the failure state.
- name: Run deprovisioner
working-directory: src/cloud-api-adaptor/test/tools
- name: Delete coco namespace
# We want to delete the coco namespace because CAA might still spawn resources
# which prevents deletion of the AKS cluster
run: |
make caa-provisioner-cli
# Ignore the error if the deprovision fails.
./caa-provisioner-cli -action=deprovision || true
az aks get-credentials \
--resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
--name "${{ format(env.CLUSTER_NAME_TEMPLATE, matrix.parameters.id) }}" || true
namespace="confidential-containers-system"
kubectl patch namespace "$namespace" -p '{"metadata":{"finalizers": null }}' || true
kubectl delete namespace "$namespace" || true
- name: Remove podvm image
if: github.event.inputs.podvm-image-id == ''
Expand All @@ -345,11 +325,35 @@ jobs:
--image "${ACR_URL}/cloud-api-adaptor:dev-${GITHUB_SHA}" \
--yes || true
- name: Remove dangling VMs
# Remove any VMs that might have been left behind in failed test runs
run: |
vms=$(az resource list \
--tag owner=github-actions \
--tag run="${{ github.run_id }}-${{ github.run_attempt }}" \
-o tsv --query "[?type == 'Microsoft.Compute/virtualMachines'].name")
for vm in $vms; do
az vm delete -n "$vm" -g "${{ secrets.AZURE_RESOURCE_GROUP }}" --yes || true
done
- name: Remove dangling NICs
# Remove any NICs that might have been left behind in failed test runs
# NICs are reserved for 180s for VMs, even if they never launched
run: |
nics=$(az resource list \
--tag owner=github-actions \
--tag run="${{ github.run_id }}-${{ github.run_attempt }}" \
-o tsv --query "[?type == 'Microsoft.Network/networkInterfaces'].name")
sleep 180
for nic in $nics; do
az network nic delete -n "$nic" -g "${{ secrets.AZURE_RESOURCE_GROUP }}" || true
done
- name: Remove AKS cluster
run: |
# Delete the cluster even if it has been deleted already or does not exist.
az aks delete \
--name "${CLUSTER_NAME}" \
--name "${{ format(env.CLUSTER_NAME_TEMPLATE, matrix.parameters.id) }}" \
--resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
--no-wait \
--yes || true
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ func getPropertiesImpl() map[string]string {
"AZURE_INSTANCE_SIZE": AzureProps.InstanceSize,
"KBS_IMAGE": AzureProps.KbsImage,
"KBS_IMAGE_TAG": AzureProps.KbsImageTag,
"TAGS": AzureProps.Tags,
}

return props
Expand All @@ -380,7 +381,7 @@ func (p *AzureCloudProvisioner) UploadPodvm(imagePath string, ctx context.Contex

func isAzureKustomizeConfigMapKey(key string) bool {
switch key {
case "CLOUD_PROVIDER", "AZURE_SUBSCRIPTION_ID", "AZURE_REGION", "AZURE_INSTANCE_SIZE", "AZURE_RESOURCE_GROUP", "AZURE_SUBNET_ID", "AZURE_IMAGE_ID", "SSH_USERNAME", "AA_KBC_PARAMS":
case "CLOUD_PROVIDER", "AZURE_SUBSCRIPTION_ID", "AZURE_REGION", "AZURE_INSTANCE_SIZE", "AZURE_RESOURCE_GROUP", "AZURE_SUBNET_ID", "AZURE_IMAGE_ID", "SSH_USERNAME", "AA_KBC_PARAMS", "TAGS":
return true
default:
return false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type AzureProperties struct {
IsSelfManaged bool
KbsImage string
KbsImageTag string
Tags string

InstanceSize string
NodeName string
Expand Down Expand Up @@ -69,6 +70,7 @@ func initAzureProperties(properties map[string]string) error {
KbsImage: properties["KBS_IMAGE"],
KbsImageTag: properties["KBS_IMAGE_TAG"],
InstanceSize: properties["AZURE_INSTANCE_SIZE"],
Tags: properties["TAGS"],
}

CIManagedStr := properties["IS_CI_MANAGED_CLUSTER"]
Expand Down
22 changes: 12 additions & 10 deletions src/cloud-providers/azure/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ func (p *azureProvider) createNetworkInterface(ctx context.Context, nicName stri
},
},
},
Tags: p.getResourceTags(),
}

if p.serviceConfig.SecurityGroupId != "" {
Expand Down Expand Up @@ -370,6 +371,16 @@ func (p *azureProvider) updateInstanceSizeSpecList() error {
return nil
}

// getResourceTags returns a freshly allocated tag map holding one entry per
// key/value pair in serviceConfig.Tags, with each value converted to a
// pointer. The result is never nil; it is empty when no tags are configured.
func (p *azureProvider) getResourceTags() map[string]*string {
	custom := p.serviceConfig.Tags
	result := make(map[string]*string, len(custom))

	// Copy every configured tag into the pointer-valued map.
	for key, value := range custom {
		result[key] = to.Ptr(value)
	}
	return result
}

func (p *azureProvider) getVMParameters(instanceSize, diskName, cloudConfig string, sshBytes []byte, instanceName string, vmNIC *armnetwork.Interface) (*armcompute.VirtualMachine, error) {
userDataB64 := base64.StdEncoding.EncodeToString([]byte(cloudConfig))

Expand Down Expand Up @@ -413,14 +424,6 @@ func (p *azureProvider) getVMParameters(instanceSize, diskName, cloudConfig stri
}
}

// Add tags to the instance
tags := map[string]*string{}

// Add custom tags from serviceConfig.Tags to the instance
for k, v := range p.serviceConfig.Tags {
tags[k] = to.Ptr(v)
}

vmParameters := armcompute.VirtualMachine{
Location: to.Ptr(p.serviceConfig.Region),
Properties: &armcompute.VirtualMachineProperties{
Expand Down Expand Up @@ -469,8 +472,7 @@ func (p *azureProvider) getVMParameters(instanceSize, diskName, cloudConfig stri
},
UserData: to.Ptr(userDataB64),
},
// Add tags to the instance
Tags: tags,
Tags: p.getResourceTags(),
}

return &vmParameters, nil
Expand Down

0 comments on commit 1308eda

Please sign in to comment.