diff --git a/.github/workflows/kind-e2e.yml b/.github/workflows/kind-e2e.yml
index 72539edda..ebe92e1bb 100644
--- a/.github/workflows/kind-e2e.yml
+++ b/.github/workflows/kind-e2e.yml
@@ -85,6 +85,10 @@ jobs:
         working-directory: ./e2e
         run: ./test-default-route1.sh
 
+      - name: Test DRA integration
+        working-directory: ./e2e
+        run: ./test-dra-integration.sh
+
       - name: Export kind logs
         if: always()
         run: |
diff --git a/.gitignore b/.gitignore
index 186b9dfa0..af074ab8c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 bin/
 e2e/bin/
 e2e/yamls/
+e2e/repos/
 
 # GOPATH created by the build script
 gopath/
diff --git a/e2e/get_tools.sh b/e2e/get_tools.sh
index fbc5b4057..c8dbf292e 100755
--- a/e2e/get_tools.sh
+++ b/e2e/get_tools.sh
@@ -13,3 +13,4 @@ curl -Lo ./bin/koko https://github.com/redhat-nfvpe/koko/releases/download/v0.83
 chmod +x ./bin/koko
 curl -Lo ./bin/jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
 chmod +x ./bin/jq
+wget -qO- https://get.helm.sh/helm-v3.14.3-linux-amd64.tar.gz | tar xvzf - --strip-components=1 -C ./bin linux-amd64/helm
diff --git a/e2e/setup_cluster.sh b/e2e/setup_cluster.sh
index 40286628a..6963405bc 100755
--- a/e2e/setup_cluster.sh
+++ b/e2e/setup_cluster.sh
@@ -34,7 +34,21 @@ nodes:
       nodeRegistration:
         kubeletExtraArgs:
           pod-manifest-path: "/etc/kubernetes/manifests/"
+          feature-gates: "DynamicResourceAllocation=true,KubeletPodResourcesDynamicResources=true"
   - role: worker
+# Required by DRA Integration
+##
+featureGates:
+  DynamicResourceAllocation: true
+runtimeConfig:
+  "api/alpha": "true"
+containerdConfigPatches:
+# Enable CDI as described in
+# https://github.com/container-orchestrated-devices/container-device-interface#containerd-configuration
+- |-
+  [plugins."io.containerd.grpc.v1.cri"]
+    enable_cdi = true
+##
 EOF
 
 # load multus image from container host to kind node
diff --git a/e2e/templates/dra-integration.yml.j2 b/e2e/templates/dra-integration.yml.j2
new file mode 100644
index 000000000..33334d900
--- /dev/null
+++ b/e2e/templates/dra-integration.yml.j2
@@ -0,0 +1,49 @@
+---
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaimTemplate
+metadata:
+  name: gpu.example.com
+spec:
+  spec:
+    resourceClassName: gpu.example.com
+---
+apiVersion: "k8s.cni.cncf.io/v1"
+kind: NetworkAttachmentDefinition
+metadata:
+  name: dra-net
+  annotations:
+    k8s.v1.cni.cncf.io/resourceName: gpu.example.com
+spec:
+  config: '{
+      "cniVersion": "{{ CNI_VERSION }}",
+      "plugins": [{
+        "name": "mynet",
+        "type": "dummy",
+        "ipam": {
+          "type": "host-local",
+          "subnet": "10.1.2.0/24"
+        }
+      }]
+    }'
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: dra-integration
+  labels:
+    app: dra-integration
+  annotations:
+    k8s.v1.cni.cncf.io/networks: default/dra-net
+spec:
+  containers:
+  - name: ctr0
+    image: ubuntu:22.04
+    command: ["bash", "-c"]
+    args: ["export; sleep 9999"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    source:
+      resourceClaimTemplateName: gpu.example.com
diff --git a/e2e/templates/multus-daemonset-thick.yml.j2 b/e2e/templates/multus-daemonset-thick.yml.j2
index 035304188..d839cd9e6 100644
--- a/e2e/templates/multus-daemonset-thick.yml.j2
+++ b/e2e/templates/multus-daemonset-thick.yml.j2
@@ -158,6 +158,9 @@ spec:
             - name: multus-daemon-config
               mountPath: /etc/cni/net.d/multus.d
               readOnly: true
+            - name: kubelet-pod-resources
+              mountPath: /var/lib/kubelet/pod-resources
+              readOnly: true
           env:
             - name: MULTUS_NODE_NAME
               valueFrom:
@@ -187,6 +190,9 @@ spec:
         - name: cnibin
           hostPath:
             path: /opt/cni/bin
+        - name: kubelet-pod-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
         - name: multus-daemon-config
           configMap:
             name: multus-daemon-config
diff --git a/e2e/test-dra-integration.sh b/e2e/test-dra-integration.sh
new file mode 100755
index 000000000..997996cb4
--- /dev/null
+++ b/e2e/test-dra-integration.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+set -o errexit
+
+export PATH=${PATH}:./bin
+
+# This test is using an example implementation of a DRA driver. This driver is mocking GPU resources. In our test we
+# don't care about what these resources are. We want to ensure that such resource is correctly passed in the Pod using
+# Multus configurations. A couple of notes:
+# - We explicitly don't pin the revision of the dra-example-driver to a specific commit to ensure that the integration
+#   continues to work even when the dra-example-driver is updated (which may also indicate API changes on the DRA).
+# - The chart and the latest image are not published anywhere, therefore we have to build locally. This leads to slower
+#   e2e suite runs.
+echo "installing dra-example-driver"
+repo_path="repos/dra-example-driver"
+
+rm -rf $repo_path || true
+git clone https://github.com/kubernetes-sigs/dra-example-driver.git ${repo_path}
+${repo_path}/demo/build-driver.sh
+KIND_CLUSTER_NAME=kind ${repo_path}/demo/scripts/load-driver-image-into-kind.sh
+chart_path=${repo_path}/deployments/helm/dra-example-driver/
+overriden_values_path=${chart_path}/overriden_values.yaml
+
+# With the thick plugin, in kind, the primary network on the control plane is not always working as expected. The pods
+# sometimes are not able to communicate with the control plane and the error looks like this:
+# failed to list *v1alpha2.PodSchedulingContext: Get "https://10.96.0.1:443/apis/resource.k8s.io/v1alpha2/podschedulingcontexts?limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: connect: no route to host
+# We override the values here to schedule the controller on the worker nodes where the network is working as expected.
+cat <<EOF >> ${overriden_values_path}
+controller:
+  nodeSelector: null
+  tolerations: null
+EOF
+
+helm install \
+  -n dra-example-driver \
+  --create-namespace \
+  -f ${overriden_values_path} \
+  dra-example-driver \
+  ${chart_path}
+
+echo "installing testing pods"
+kubectl create -f yamls/dra-integration.yml
+kubectl wait --for=condition=ready -l app=dra-integration --timeout=300s pod
+
+echo "check dra-integration pod for DRA injected environment variable"
+
+# We can validate that the resource is correctly injected by checking an environment variable this dra driver is injecting
+# in the Pod.
+# https://github.com/kubernetes-sigs/dra-example-driver/blob/be2b8b1db47b8c757440e955ce5ced88c23bfe86/cmd/dra-example-kubeletplugin/cdi.go#L71C20-L71C44
+# The check runs as the `if` condition because errexit is active: a failing standalone command would abort the script
+# before the failure message below could be printed.
+if kubectl exec dra-integration -- bash -c "echo \$DRA_RESOURCE_DRIVER_NAME | grep -qF gpu.resource.example.com"; then
+  echo "dra-integration pod has DRA injected environment variable"
+else
+  echo "dra-integration pod doesn't have DRA injected environment variable"
+  exit 1
+fi
+
+echo "cleanup resources"
+kubectl delete -f yamls/dra-integration.yml
+helm uninstall -n dra-example-driver dra-example-driver