Skip to content

Create AKS cluster, deploy CKF and run bundle test #120

Create AKS cluster, deploy CKF and run bundle test

Create AKS cluster, deploy CKF and run bundle test #120

name: Create AKS cluster, deploy CKF and run bundle test
on:
workflow_dispatch:
inputs:
bundle_version:
description: 'Comma-separated list of bundle versions e.g. 1.8, 1.9, latest. Make sure that the corresponding K8s version is supported by the cloud.'
default: '1.8, 1.9, latest'
required: true
k8s_version:
description: 'Kubernetes version to be used for the AKS cluster'
required: false
uats_branch:
description: 'Branch to run the UATs from e.g. main or track/1.9. By default, this is defined by the dependencies.yaml file.'
required: false
schedule:
- cron: "17 0 * * 2"
jobs:
preprocess-input:
runs-on: ubuntu-24.04
outputs:
processed_bundle_versions: ${{ steps.process_bundle_versions.outputs.bundle_versions }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install CLI tools
run: |
sudo snap install yq
- name: Process bundle versions
id: process_bundle_versions
run: |
if [[ "${{ github.event_name }}" == "schedule" ]]; then
# Use `tr` to remove new lines as a workaround to:
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#multiline-strngs
bundle_versions=$(yq '. | keys' .github/dependencies.yaml -o=json | tr -d '\n')
echo "bundle_versions=${bundle_versions}"
echo "bundle_versions=${bundle_versions}" >> $GITHUB_OUTPUT
else
python scripts/gh-actions/parse_versions.py "${{ inputs.bundle_version }}"
fi
deploy-ckf-to-aks:
needs: preprocess-input
runs-on: ubuntu-24.04
strategy:
matrix:
bundle_version: ${{ fromJSON(needs.preprocess-input.outputs.processed_bundle_versions) }}
fail-fast: false
env:
AZURE_CORE_OUTPUT: none
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Run YAML to Github Output Action
id: yaml-output
uses: christian-ci/action-yaml-github-output@v2
with:
file_path: ".github/dependencies.yaml"
main_key: ${{ matrix.bundle_version }}
- name: Update ENV variables from inputs if available
run: |
K8S_VERSION=${{ inputs.k8s_version || env.K8S_VERSION }}
echo "K8S_VERSION=${K8S_VERSION}" >> $GITHUB_ENV
UATS_BRANCH=${{ inputs.uats_branch || env.UATS_BRANCH }}
echo "UATS_BRANCH=${UATS_BRANCH}" >> $GITHUB_ENV
- name: Install CLI tools
run: |
pip install tox
sudo snap install juju --classic --channel=${{ env.JUJU_VERSION }}/stable
sudo snap install charmcraft --classic
juju version
- uses: azure/login@v1
with:
creds: ${{ secrets.BUNDLE_KUBEFLOW_AKS_SERVICE_PRINCIPAL }}
- name: Create resource group and cluster
run: |
# We need to remove the dot from version
# due to cluster naming restrictions
version=${{ matrix.bundle_version }}
KF_VERSION="kf-${version//.}"
RESOURCE_GROUP=${KF_VERSION}-ResourceGroup
NAME=${KF_VERSION}-AKSCluster
LOCATION=westeurope
echo "RESOURCE_GROUP=${RESOURCE_GROUP}" >> $GITHUB_ENV
echo "NAME=${NAME}" >> $GITHUB_ENV
echo "LOCATION=${LOCATION}" >> $GITHUB_ENV
az group create --name ${RESOURCE_GROUP} --location ${LOCATION}
az aks create \
--resource-group ${RESOURCE_GROUP} \
--name ${NAME} \
--kubernetes-version ${{ env.K8S_VERSION }} \
--node-count 2 \
--node-vm-size Standard_D8s_v3 \
--node-osdisk-size 100 \
--node-osdisk-type Managed \
--os-sku Ubuntu \
--no-ssh-key
- name: Add AKS cloud to juju and bootstrap controller
run: |
az aks get-credentials --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --admin
juju add-k8s aks --client
juju bootstrap aks aks-controller
juju add-model kubeflow
- name: Test bundle deployment
run: |
tox -vve test_bundle_deployment-${{ matrix.bundle_version }} -- --model kubeflow --keep-models -vv -s
- name: Run Kubeflow UATs
run: |
git clone https://github.com/canonical/charmed-kubeflow-uats.git ~/charmed-kubeflow-uats
cd ~/charmed-kubeflow-uats
git checkout ${{ env.UATS_BRANCH }}
tox -e kubeflow-remote
# On failure, capture debugging resources
- name: Save debug artifacts
uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main
if: always()
- name: Get juju status
run: juju status
if: failure() || cancelled()
- name: Get juju debug logs
run: juju debug-log --replay --no-tail
if: failure() || cancelled()
- name: Get all kubernetes resources
run: kubectl get all -A
if: failure() || cancelled()
- name: Describe all pods
if: failure() || cancelled()
run: kubectl describe pods --all-namespaces
- name: Get logs from pods with status = Pending
run: kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Get logs from pods with status = Failed
run: kubectl -n kubeflow get pods | tail -n +2 | grep Failed | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Get logs from pods with status = CrashLoopBackOff
run: kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()
- name: Delete AKS cluster
if: always()
run: az aks delete --resource-group ${{ env.RESOURCE_GROUP }} --name ${{ env.NAME }} --yes
- name: Delete resource groups
if: always()
run: |
az group delete --name ${{ env.RESOURCE_GROUP }} --yes
if [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
az group delete --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }} --yes
fi
- name: Check that resource groups have been deleted, else fail
if: always()
run: |
if [ "$(az group exists --name ${{ env.RESOURCE_GROUP }} )" = "true" ] || [ "$(az group exists --name MC_${{ env.RESOURCE_GROUP }}_${{ env.NAME }}_${{ env.LOCATION }})" = "true" ]; then
exit 1
fi