From 5e1c480cf494688f0147ae22c92a952a68b9c496 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sat, 13 Jul 2024 01:14:16 +0000 Subject: [PATCH] WIP: Update the rest of the code for 1.31 Signed-off-by: Kevin Klues --- .gitignore | 1 - README.md | 98 +++++--- .../claimparametersgen.go | 229 ------------------ cmd/dra-example-controller/main.go | 207 ---------------- cmd/dra-example-kubeletplugin/cdi.go | 42 ++-- cmd/dra-example-kubeletplugin/checkpoint.go | 2 +- cmd/dra-example-kubeletplugin/discovery.go | 19 +- cmd/dra-example-kubeletplugin/driver.go | 99 +++----- cmd/dra-example-kubeletplugin/main.go | 38 ++- cmd/dra-example-kubeletplugin/state.go | 132 +++------- demo/gpu-test1.yaml | 11 +- demo/gpu-test2.yaml | 8 +- demo/gpu-test3.yaml | 11 +- demo/gpu-test4.yaml | 23 +- deployments/container/Dockerfile | 1 - .../templates/controller.yaml | 58 ----- .../templates/deviceclass.yaml | 9 + .../templates/resourceclass.yaml | 7 - go.mod | 6 +- go.sum | 8 +- pkg/flags/crds.go | 51 ---- pkg/flags/kubeclient.go | 13 +- 22 files changed, 224 insertions(+), 849 deletions(-) delete mode 100644 cmd/dra-example-controller/claimparametersgen.go delete mode 100644 cmd/dra-example-controller/main.go delete mode 100644 deployments/helm/dra-example-driver/templates/controller.yaml create mode 100644 deployments/helm/dra-example-driver/templates/deviceclass.yaml delete mode 100644 deployments/helm/dra-example-driver/templates/resourceclass.yaml delete mode 100644 pkg/flags/crds.go diff --git a/.gitignore b/.gitignore index 5e8272e1..20171eb0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,4 @@ vendor/ [._]*.sw[a-p] -./dra-example-controller ./dra-example-kubeletplugin diff --git a/README.md b/README.md index c8a9de6f..30a777ff 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ subdirectory. 
All of the scripts and example Pod specs used in this demo are contained here, so take a moment to browse through the various files and see what's available: ``` -git clone https://github.com/kubernetes-sigs/dra-example-driver.git +git clone https://github.com/klueska/dra-example-driver.git +git -C dra-example-driver checkout -t origin/update-v1.31 cd dra-example-driver ``` @@ -96,32 +97,69 @@ And show the initial state of available GPU devices on the worker node: $ kubectl get resourceslice -o yaml apiVersion: v1 items: -- apiVersion: resource.k8s.io/v1alpha2 - driverName: gpu.resource.example.com +- apiVersion: resource.k8s.io/v1alpha3 kind: ResourceSlice metadata: - creationTimestamp: "2024-04-17T13:45:44Z" - generateName: dra-example-driver-cluster-worker-gpu.resource.example.com- - name: dra-example-driver-cluster-worker-gpu.resource.example.comxktph + creationTimestamp: "2024-07-13T01:10:56Z" + generateName: dra-example-driver-cluster-worker-gpu.example.com- + generation: 1 + name: dra-example-driver-cluster-worker-gpu.example.com-vkx6z ownerReferences: - apiVersion: v1 controller: true kind: Node name: dra-example-driver-cluster-worker - uid: 4dc7c3b2-d99c-492b-8ede-37d435e56b2d - resourceVersion: "1189" - uid: 61c965b5-54a9-40ee-88a1-c52a814fa624 - namedResources: - instances: - - name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac - - name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b - - name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39 - - name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac - - name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744 - - name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243 - - name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747 - - name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e - nodeName: dra-example-driver-cluster-worker + uid: fe806a5b-17a3-42d6-a9bc-fc8ad7609a1a + resourceVersion: "523" + uid: 7a61895f-f5d6-4679-a68c-7ae3c148c9cb + spec: + driver: gpu.example.com + nodeName: dra-example-driver-cluster-worker + pool: + generation: 0 + name: dra-example-driver-cluster-worker + 
resourceSliceCount: 1 + devices: + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747 + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-18db0e85-99e9-c746-8531-ffeb86328b39 + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-93d37703-997c-c46f-a531-755e3e0dc2ac + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744 + - basic: + attributes: + model: + string: LATEST-GPU-MODEL + name: gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243 kind: List metadata: resourceVersion: "" @@ -173,28 +211,28 @@ This should produce output similar to the following: ```bash gpu-test1: pod0 ctr0: -declare -x GPU_DEVICE_0="GPU-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b" +declare -x GPU_DEVICE_0="gpu-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747" pod1 ctr0: -declare -x GPU_DEVICE_0="GPU-ee3e4b55-fcda-44b8-0605-64b7a9967744" +declare -x GPU_DEVICE_0="gpu-f11773a1-5bfb-e48b-3d98-1beb5baaf08e" gpu-test2: pod0 ctr0: -declare -x GPU_DEVICE_0="GPU-9ede7e32-5825-a11b-fa3d-bab6d47e0243" +declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac" pod0 ctr1: -declare -x GPU_DEVICE_0="GPU-9ede7e32-5825-a11b-fa3d-bab6d47e0243" +declare -x GPU_DEVICE_0="gpu-0159f35e-99ee-b2b5-74f1-9d18df3f22ac" gpu-test3: pod0 ctr0: -declare -x GPU_DEVICE_0="GPU-93d37703-997c-c46f-a531-755e3e0dc2ac" +declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b" pod1 ctr0: -declare -x GPU_DEVICE_0="GPU-93d37703-997c-c46f-a531-755e3e0dc2ac" +declare -x GPU_DEVICE_0="gpu-657bd2e7-f5c2-a7f2-fbaa-0d1cdc32f81b" gpu-test4: pod0 ctr0: -declare -x 
GPU_DEVICE_0="GPU-18db0e85-99e9-c746-8531-ffeb86328b39" -declare -x GPU_DEVICE_1="GPU-e7b42cb1-4fd8-91b2-bc77-352a0c1f5747" -declare -x GPU_DEVICE_2="GPU-f11773a1-5bfb-e48b-3d98-1beb5baaf08e" -declare -x GPU_DEVICE_3="GPU-0159f35e-99ee-b2b5-74f1-9d18df3f22ac" +declare -x GPU_DEVICE_0="gpu-18db0e85-99e9-c746-8531-ffeb86328b39" +declare -x GPU_DEVICE_1="gpu-93d37703-997c-c46f-a531-755e3e0dc2ac" +declare -x GPU_DEVICE_2="gpu-ee3e4b55-fcda-44b8-0605-64b7a9967744" +declare -x GPU_DEVICE_3="gpu-9ede7e32-5825-a11b-fa3d-bab6d47e0243" ``` In this example resource driver, no "actual" GPUs are made available to any diff --git a/cmd/dra-example-controller/claimparametersgen.go b/cmd/dra-example-controller/claimparametersgen.go deleted file mode 100644 index dd21d1fb..00000000 --- a/cmd/dra-example-controller/claimparametersgen.go +++ /dev/null @@ -1,229 +0,0 @@ -package main - -import ( - "context" - "encoding/json" - "fmt" - "strings" - - resourceapi "k8s.io/api/resource/v1alpha3" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/watch" - "k8s.io/client-go/dynamic" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" - "k8s.io/utils/ptr" - - gpucrd "sigs.k8s.io/dra-example-driver/api/example.com/resource/gpu/v1alpha1" -) - -const ( - DriverAPIGroup = gpucrd.GroupName - DriverName = gpucrd.GroupName -) - -func StartClaimParametersGenerator(ctx context.Context, config *Config) error { - // Build a client set config - csconfig, err := config.flags.kubeClientConfig.NewClientSetConfig() - if err != nil { - return fmt.Errorf("error creating client set config: %w", err) - } - - // Create a new dynamic client - dynamicClient, err := dynamic.NewForConfig(csconfig) - if err != nil { - return fmt.Errorf("error creating dynamic client: %w", err) - } - - klog.Info("Starting ResourceClaimParamaters 
generator") - - // Set up informer to watch for GpuClaimParameters objects - gpuClaimParametersInformer := newGpuClaimParametersInformer(ctx, dynamicClient) - - // Set up handler for events - _, err = gpuClaimParametersInformer.AddEventHandler(newGpuClaimParametersHandler(ctx, config.clientSets.Core, dynamicClient)) - if err != nil { - return fmt.Errorf("error adding event handler: %w", err) - } - - // Start informer - go gpuClaimParametersInformer.Run(ctx.Done()) - - return nil -} - -func newGpuClaimParametersInformer(ctx context.Context, dynamicClient dynamic.Interface) cache.SharedIndexInformer { - // Set up shared index informer for GpuClaimParameters objects - gvr := schema.GroupVersionResource{ - Group: gpucrd.GroupName, - Version: gpucrd.Version, - Resource: strings.ToLower(gpucrd.GpuClaimParametersKind), - } - - informer := cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { - return dynamicClient.Resource(gvr).List(ctx, metav1.ListOptions{}) - }, - WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { - return dynamicClient.Resource(gvr).Watch(ctx, metav1.ListOptions{}) - }, - }, - &unstructured.Unstructured{}, - 0, // resyncPeriod - cache.Indexers{}, - ) - - return informer -} - -func newGpuClaimParametersHandler(ctx context.Context, clientset kubernetes.Interface, dynamicClient dynamic.Interface) cache.ResourceEventHandler { - return cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj any) { - unstructured, ok := obj.(*unstructured.Unstructured) - if !ok { - klog.Errorf("Error converting object to *unstructured.Unstructured: %v", obj) - } - - var gpuClaimParameters gpucrd.GpuClaimParameters - err := runtime.DefaultUnstructuredConverter.FromUnstructured(unstructured.Object, &gpuClaimParameters) - if err != nil { - klog.Errorf("Error converting *unstructured.Unstructured to GpuClaimParameters: %v", err) - return - } - - if err := 
createOrUpdateResourceClaimParameters(ctx, clientset, &gpuClaimParameters); err != nil { - klog.Errorf("Error creating ResourceClaimParameters: %v", err) - return - } - }, - UpdateFunc: func(oldObj any, newObj any) { - unstructured, ok := newObj.(*unstructured.Unstructured) - if !ok { - klog.Errorf("Error converting object to *unstructured.Unstructured: %v", newObj) - } - - var gpuClaimParameters gpucrd.GpuClaimParameters - err := runtime.DefaultUnstructuredConverter.FromUnstructured(unstructured.Object, &gpuClaimParameters) - if err != nil { - klog.Errorf("Error converting *unstructured.Unstructured to GpuClaimParameters: %v", err) - return - } - - if err := createOrUpdateResourceClaimParameters(ctx, clientset, &gpuClaimParameters); err != nil { - klog.Errorf("Error updating ResourceClaimParameters: %v", err) - return - } - }, - } -} - -func newResourceClaimParametersFromGpuClaimParameters(gpuClaimParameters *gpucrd.GpuClaimParameters) (*resourceapi.ResourceClaimParameters, error) { - namespace := gpuClaimParameters.Namespace - - rawSpec, err := json.Marshal(gpuClaimParameters.Spec) - if err != nil { - return nil, fmt.Errorf("error marshaling GpuClaimParamaters to JSON: %w", err) - } - - resourceCount := 1 - if gpuClaimParameters.Spec.Count != nil { - resourceCount = *gpuClaimParameters.Spec.Count - } - - selector := "true" - shareable := true - - var resourceRequests []resourceapi.ResourceRequest - for i := 0; i < resourceCount; i++ { - request := resourceapi.ResourceRequest{ - ResourceRequestModel: resourceapi.ResourceRequestModel{ - NamedResources: &resourceapi.NamedResourcesRequest{ - Selector: selector, - }, - }, - } - resourceRequests = append(resourceRequests, request) - } - - resourceClaimParameters := &resourceapi.ResourceClaimParameters{ - ObjectMeta: metav1.ObjectMeta{ - GenerateName: "resource-claim-parameters-", - Namespace: namespace, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: gpuClaimParameters.APIVersion, - Kind: 
gpuClaimParameters.Kind, - Name: gpuClaimParameters.Name, - UID: gpuClaimParameters.UID, - BlockOwnerDeletion: ptr.To(true), - }, - }, - }, - GeneratedFrom: &resourceapi.ResourceClaimParametersReference{ - APIGroup: gpucrd.GroupName, - Kind: gpuClaimParameters.Kind, - Name: gpuClaimParameters.Name, - }, - DriverRequests: []resourceapi.DriverRequests{ - { - DriverName: DriverName, - VendorParameters: runtime.RawExtension{Raw: rawSpec}, - Requests: resourceRequests, - }, - }, - Shareable: shareable, - } - - return resourceClaimParameters, nil -} - -func createOrUpdateResourceClaimParameters(ctx context.Context, clientset kubernetes.Interface, gpuClaimParameters *gpucrd.GpuClaimParameters) error { - namespace := gpuClaimParameters.Namespace - - // Get a list of existing ResourceClaimParameters in the same namespace as the incoming GpuClaimParameters - existing, err := clientset.ResourceV1alpha2().ResourceClaimParameters(namespace).List(ctx, metav1.ListOptions{}) - if err != nil { - return fmt.Errorf("error listing existing ResourceClaimParameters: %w", err) - } - - // Build a new ResourceClaimParameters object from the incoming GpuClaimParameters object - resourceClaimParameters, err := newResourceClaimParametersFromGpuClaimParameters(gpuClaimParameters) - if err != nil { - return fmt.Errorf("error building new ResourceClaimParameters object from a GpuClaimParameters object: %w", err) - } - - // If there is an existing ResourceClaimParameters generated from the incoming GpuClaimParameters object, then update it - if len(existing.Items) > 0 { - for _, item := range existing.Items { - if (item.GeneratedFrom.APIGroup == gpucrd.GroupName) && - (item.GeneratedFrom.Kind == gpuClaimParameters.Kind) && - (item.GeneratedFrom.Name == gpuClaimParameters.Name) { - klog.Infof("ResourceClaimParameters already exists for GpuClaimParameters %s/%s, updating it", namespace, gpuClaimParameters.Name) - - // Copy the matching ResourceClaimParameters metadata into the new 
ResourceClaimParameters object before updating it - resourceClaimParameters.ObjectMeta = *item.ObjectMeta.DeepCopy() - - _, err = clientset.ResourceV1alpha2().ResourceClaimParameters(namespace).Update(ctx, resourceClaimParameters, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("error updating ResourceClaimParameters object: %w", err) - } - - return nil - } - } - } - - // Otherwise create a new ResourceClaimParameters object from the incoming GpuClaimParameters object - _, err = clientset.ResourceV1alpha2().ResourceClaimParameters(namespace).Create(ctx, resourceClaimParameters, metav1.CreateOptions{}) - if err != nil { - return fmt.Errorf("error creating ResourceClaimParameters object from GpuClaimParameters object: %w", err) - } - - klog.Infof("Created ResourceClaimParameters for GpuClaimParameters %s/%s", namespace, gpuClaimParameters.Name) - return nil -} diff --git a/cmd/dra-example-controller/main.go b/cmd/dra-example-controller/main.go deleted file mode 100644 index fcc8e26a..00000000 --- a/cmd/dra-example-controller/main.go +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright 2023 The Kubernetes Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package main - -import ( - "context" - "fmt" - "net" - "net/http" - "net/http/pprof" - "os" - "path" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/urfave/cli/v2" - - "k8s.io/component-base/metrics/legacyregistry" - "k8s.io/klog/v2" - - _ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration - _ "k8s.io/component-base/metrics/prometheus/version" // for version metric registration - _ "k8s.io/component-base/metrics/prometheus/workqueue" // register work queues in the default legacy registry - - "sigs.k8s.io/dra-example-driver/pkg/flags" -) - -type Flags struct { - kubeClientConfig flags.KubeClientConfig - loggingConfig *flags.LoggingConfig - crdConfig flags.CRDConfig - - workers int - - httpEndpoint string - metricsPath string - profilePath string -} - -type Config struct { - namespace string - flags *Flags - clientSets flags.ClientSets - mux *http.ServeMux -} - -func main() { - if err := newApp().Run(os.Args); err != nil { - fmt.Fprintf(os.Stderr, "Error: %v\n", err) - os.Exit(1) - } -} - -func newApp() *cli.App { - flags := &Flags{ - loggingConfig: flags.NewLoggingConfig(), - } - cliFlags := []cli.Flag{ - &cli.IntFlag{ - Name: "workers", - Usage: "Concurrency to process multiple claims", - Value: 10, - Destination: &flags.workers, - EnvVars: []string{"WORKERS"}, - }, - - &cli.StringFlag{ - Category: "HTTP server:", - Name: "http-endpoint", - Usage: "The TCP network `address` where the HTTP server for diagnostics, including pprof and metrics will listen (example: `:8080`). 
The default is the empty string, which means the server is disabled.", - Destination: &flags.httpEndpoint, - EnvVars: []string{"HTTP_ENDPOINT"}, - }, - &cli.StringFlag{ - Category: "HTTP server:", - Name: "metrics-path", - Usage: "The HTTP `path` where Prometheus metrics will be exposed, disabled if empty.", - Value: "/metrics", - Destination: &flags.metricsPath, - EnvVars: []string{"METRICS_PATH"}, - }, - &cli.StringFlag{ - Category: "HTTP server:", - Name: "pprof-path", - Usage: "The HTTP `path` where pprof profiling will be available, disabled if empty.", - Destination: &flags.profilePath, - EnvVars: []string{"PPROF_PATH"}, - }, - } - - cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...) - cliFlags = append(cliFlags, flags.loggingConfig.Flags()...) - flags.crdConfig.HideNodeName = true - cliFlags = append(cliFlags, flags.crdConfig.Flags()...) - - app := &cli.App{ - Name: "dra-example-controller", - Usage: "dra-example-controller implements a DRA driver controller.", - ArgsUsage: " ", - HideHelpCommand: true, - Flags: cliFlags, - Before: func(c *cli.Context) error { - if c.Args().Len() > 0 { - return fmt.Errorf("arguments not supported: %v", c.Args().Slice()) - } - return flags.loggingConfig.Apply() - }, - Action: func(c *cli.Context) error { - ctx := c.Context - mux := http.NewServeMux() - - clientSets, err := flags.kubeClientConfig.NewClientSets() - if err != nil { - return fmt.Errorf("create client: %v", err) - } - - config := &Config{ - mux: mux, - flags: flags, - namespace: flags.crdConfig.Namespace, - clientSets: clientSets, - } - - if flags.httpEndpoint != "" { - err = SetupHTTPEndpoint(ctx, config) - if err != nil { - return fmt.Errorf("create http endpoint: %v", err) - } - } - - err = StartClaimParametersGenerator(ctx, config) - if err != nil { - return fmt.Errorf("start claim parameters generator: %w", err) - } - - <-ctx.Done() - return nil - }, - } - - return app -} - -func SetupHTTPEndpoint(ctx context.Context, config *Config) error { - 
logger := klog.FromContext(ctx) - logger = klog.LoggerWithName(logger, "http-server") - if config.flags.metricsPath != "" { - // To collect metrics data from the metric handler itself, we - // let it register itself and then collect from that registry. - reg := prometheus.NewRegistry() - gatherers := prometheus.Gatherers{ - // Include Go runtime and process metrics: - // https://github.com/kubernetes/kubernetes/blob/9780d88cb6a4b5b067256ecb4abf56892093ee87/staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go#L46-L49 - legacyregistry.DefaultGatherer, - } - gatherers = append(gatherers, reg) - - actualPath := path.Join("/", config.flags.metricsPath) - logger.Info("Starting metrics", "path", actualPath) - // This is similar to k8s.io/component-base/metrics HandlerWithReset - // except that we gather from multiple sources. - config.mux.Handle(actualPath, - promhttp.InstrumentMetricHandler( - reg, - promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{}))) - } - - if config.flags.profilePath != "" { - actualPath := path.Join("/", config.flags.profilePath) - logger.Info("Starting profiling", "path", actualPath) - config.mux.HandleFunc(actualPath, pprof.Index) - config.mux.HandleFunc(path.Join(actualPath, "cmdline"), pprof.Cmdline) - config.mux.HandleFunc(path.Join(actualPath, "profile"), pprof.Profile) - config.mux.HandleFunc(path.Join(actualPath, "symbol"), pprof.Symbol) - config.mux.HandleFunc(path.Join(actualPath, "trace"), pprof.Trace) - } - - listener, err := net.Listen("tcp", config.flags.httpEndpoint) - if err != nil { - return fmt.Errorf("listen on HTTP endpoint: %v", err) - } - - go func() { - logger.Info("Starting HTTP server", "endpoint", config.flags.httpEndpoint) - err := http.Serve(listener, config.mux) - if err != nil { - logger.Error(err, "HTTP server failed") - klog.FlushAndExit(klog.ExitFlushTimeout, 1) - } - }() - - return nil -} diff --git a/cmd/dra-example-kubeletplugin/cdi.go b/cmd/dra-example-kubeletplugin/cdi.go index 
9fbbc167..1c3b48c8 100644 --- a/cmd/dra-example-kubeletplugin/cdi.go +++ b/cmd/dra-example-kubeletplugin/cdi.go @@ -22,8 +22,6 @@ import ( cdiapi "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" cdispec "github.com/container-orchestrated-devices/container-device-interface/specs-go" - - gpucrd "sigs.k8s.io/dra-example-driver/api/example.com/resource/gpu/v1alpha1" ) const ( @@ -89,7 +87,7 @@ func (cdi *CDIHandler) CreateCommonSpecFile() error { return cdi.registry.SpecDB().WriteSpec(spec, specName) } -func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices *PreparedDevices) error { +func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevices) error { specName := cdiapi.GenerateTransientSpecName(cdiVendor, cdiClass, claimUID) spec := &cdispec.Spec{ @@ -98,22 +96,17 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices *PreparedDev } gpuIdx := 0 - switch devices.Type() { - case gpucrd.GpuDeviceType: - for _, device := range devices.Gpu.Devices { - cdiDevice := cdispec.Device{ - Name: device.UUID, - ContainerEdits: cdispec.ContainerEdits{ - Env: []string{ - fmt.Sprintf("GPU_DEVICE_%d=%s", gpuIdx, device.UUID), - }, + for _, device := range devices { + cdiDevice := cdispec.Device{ + Name: device.DeviceName, + ContainerEdits: cdispec.ContainerEdits{ + Env: []string{ + fmt.Sprintf("GPU_DEVICE_%d=%s", gpuIdx, device.DeviceName), }, - } - spec.Devices = append(spec.Devices, cdiDevice) - gpuIdx++ + }, } - default: - return fmt.Errorf("unknown device type: %v", devices.Type()) + spec.Devices = append(spec.Devices, cdiDevice) + gpuIdx++ } minVersion, err := cdiapi.MinimumRequiredVersion(spec) @@ -130,20 +123,15 @@ func (cdi *CDIHandler) DeleteClaimSpecFile(claimUID string) error { return cdi.registry.SpecDB().RemoveSpec(specName) } -func (cdi *CDIHandler) GetClaimDevices(claimUID string, devices *PreparedDevices) ([]string, error) { +func (cdi *CDIHandler) GetClaimDevices(devices []string) 
[]string { cdiDevices := []string{ cdiapi.QualifiedName(cdiVendor, cdiClass, cdiCommonDeviceName), } - switch devices.Type() { - case gpucrd.GpuDeviceType: - for _, device := range devices.Gpu.Devices { - cdiDevice := cdiapi.QualifiedName(cdiVendor, cdiClass, device.UUID) - cdiDevices = append(cdiDevices, cdiDevice) - } - default: - return nil, fmt.Errorf("unknown device type: %v", devices.Type()) + for _, device := range devices { + cdiDevice := cdiapi.QualifiedName(cdiVendor, cdiClass, device) + cdiDevices = append(cdiDevices, cdiDevice) } - return cdiDevices, nil + return cdiDevices } diff --git a/cmd/dra-example-kubeletplugin/checkpoint.go b/cmd/dra-example-kubeletplugin/checkpoint.go index 0762fdb9..7311e760 100644 --- a/cmd/dra-example-kubeletplugin/checkpoint.go +++ b/cmd/dra-example-kubeletplugin/checkpoint.go @@ -19,7 +19,7 @@ func newCheckpoint() *Checkpoint { pc := &Checkpoint{ Checksum: 0, V1: &CheckpointV1{ - PreparedClaims: make(map[string]*PreparedDevices), + PreparedClaims: make(PreparedClaims), }, } return pc diff --git a/cmd/dra-example-kubeletplugin/discovery.go b/cmd/dra-example-kubeletplugin/discovery.go index 9608a451..19fa9c63 100644 --- a/cmd/dra-example-kubeletplugin/discovery.go +++ b/cmd/dra-example-kubeletplugin/discovery.go @@ -20,6 +20,9 @@ import ( "math/rand" "os" + resourceapi "k8s.io/api/resource/v1alpha3" + "k8s.io/utils/ptr" + "github.com/google/uuid" ) @@ -30,13 +33,17 @@ func enumerateAllPossibleDevices() (AllocatableDevices, error) { alldevices := make(AllocatableDevices) for _, uuid := range uuids { - deviceInfo := &AllocatableDeviceInfo{ - GpuInfo: &GpuInfo{ - UUID: uuid, - model: "LATEST-GPU-MODEL", + device := resourceapi.Device{ + Name: uuid, + Basic: &resourceapi.BasicDevice{ + Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{ + "model": { + StringValue: ptr.To("LATEST-GPU-MODEL"), + }, + }, }, } - alldevices[uuid] = deviceInfo + alldevices[uuid] = device } return alldevices, nil } @@ -49,7 +56,7 @@ 
func generateUUIDs(seed string, count int) []string { charset := make([]byte, 16) rand.Read(charset) uuid, _ := uuid.FromBytes(charset) - uuids[i] = "GPU-" + uuid.String() + uuids[i] = "gpu-" + uuid.String() } return uuids diff --git a/cmd/dra-example-kubeletplugin/driver.go b/cmd/dra-example-kubeletplugin/driver.go index 60c1fc6f..5f2e1b09 100644 --- a/cmd/dra-example-kubeletplugin/driver.go +++ b/cmd/dra-example-kubeletplugin/driver.go @@ -20,63 +20,65 @@ import ( "context" "fmt" - resourceapi "k8s.io/api/resource/v1alpha3" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + coreclientset "k8s.io/client-go/kubernetes" + "k8s.io/dynamic-resource-allocation/kubeletplugin" "k8s.io/klog/v2" + drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4" ) var _ drapbv1.NodeServer = &driver{} type driver struct { - doneCh chan struct{} + client coreclientset.Interface + plugin kubeletplugin.DRAPlugin state *DeviceState } func NewDriver(ctx context.Context, config *Config) (*driver, error) { + driver := &driver{ + client: config.coreclient, + } + state, err := NewDeviceState(config) if err != nil { return nil, err } + driver.state = state + + plugin, err := kubeletplugin.Start( + ctx, + driver, + kubeletplugin.KubeClient(config.coreclient), + kubeletplugin.NodeName(config.flags.nodeName), + kubeletplugin.DriverName(DriverName), + kubeletplugin.RegistrarSocketPath(PluginRegistrationPath), + kubeletplugin.PluginSocketPath(DriverPluginSocketPath), + kubeletplugin.KubeletPluginSocketPath(DriverPluginSocketPath)) + if err != nil { + return nil, err + } + driver.plugin = plugin - d := &driver{ - state: state, + var resources kubeletplugin.Resources + for _, device := range state.allocatable { + resources.Devices = append(resources.Devices, device) } + plugin.PublishResources(ctx, resources) - return d, nil + return driver, nil } func (d *driver) Shutdown(ctx context.Context) error { - close(d.doneCh) + d.plugin.Stop() return nil } -func (d *driver) NodeListAndWatchResources(req 
*drapbv1.NodeListAndWatchResourcesRequest, stream drapbv1.Node_NodeListAndWatchResourcesServer) error { - model := d.state.getResourceModelFromAllocatableDevices() - resp := &drapbv1.NodeListAndWatchResourcesResponse{ - Resources: []*resourceapi.ResourceModel{&model}, - } - - if err := stream.Send(resp); err != nil { - return err - } - - //nolint:all,S1000: should use for range instead of for { select {} } (gosimple) - for { - select { - case <-d.doneCh: - return nil - } - // TODO: Update with case for when GPUs go unhealthy - } -} - func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrepareResourcesRequest) (*drapbv1.NodePrepareResourcesResponse, error) { klog.Infof("NodePrepareResource is called: number of claims: %d", len(req.Claims)) preparedResources := &drapbv1.NodePrepareResourcesResponse{Claims: map[string]*drapbv1.NodePrepareResourceResponse{}} - // In production version some common operations of d.nodeUnprepareResources - // should be done outside of the loop, for instance updating the CR could - // be done once after all HW was prepared. 
for _, claim := range req.Claims { preparedResources.Claims[claim.Uid] = d.nodePrepareResource(ctx, claim) } @@ -85,28 +87,25 @@ func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrep } func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodePrepareResourceResponse { - if len(claim.StructuredResourceHandle) == 0 { - return &drapbv1.NodePrepareResourceResponse{ - Error: "driver only supports structured parameters", - } - } - - allocated, err := d.getAllocatedDevices(ctx, claim) + resourceClaim, err := d.client.ResourceV1alpha3().ResourceClaims(claim.Namespace).Get( + context.TODO(), + claim.Name, + metav1.GetOptions{}) if err != nil { return &drapbv1.NodePrepareResourceResponse{ - Error: fmt.Sprintf("error allocating devices for claim %v: %v", claim.Uid, err), + Error: fmt.Sprintf("failed to fetch ResourceClaim %s in namespace %s", claim.Name, claim.Namespace), } } - prepared, err := d.state.Prepare(claim.Uid, allocated) + prepared, err := d.state.Prepare(resourceClaim) if err != nil { return &drapbv1.NodePrepareResourceResponse{ Error: fmt.Sprintf("error preparing devices for claim %v: %v", claim.Uid, err), } } - klog.Infof("Returning newly prepared devices for claim '%v': %s", claim.Uid, prepared) - return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared} + klog.Infof("Returning newly prepared devices for claim '%v': %v", claim.Uid, prepared) + return &drapbv1.NodePrepareResourceResponse{Devices: prepared} } func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) { @@ -121,12 +120,6 @@ func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUn } func (d *driver) nodeUnprepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodeUnprepareResourceResponse { - if len(claim.StructuredResourceHandle) == 0 { - return &drapbv1.NodeUnprepareResourceResponse{ - Error: 
"driver only supports structured parameters", - } - } - if err := d.state.Unprepare(claim.Uid); err != nil { return &drapbv1.NodeUnprepareResourceResponse{ Error: fmt.Sprintf("error unpreparing devices for claim %v: %v", claim.Uid, err), @@ -135,17 +128,3 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claim *drapbv1.Claim return &drapbv1.NodeUnprepareResourceResponse{} } - -func (d *driver) getAllocatedDevices(ctx context.Context, claim *drapbv1.Claim) (AllocatedDevices, error) { - allocated := AllocatedDevices{ - Gpu: &AllocatedGpus{}, - } - - for _, r := range claim.StructuredResourceHandle[0].Results { - name := r.AllocationResultModel.NamedResources.Name - gpu := fmt.Sprintf("GPU-%s", name[4:]) - allocated.Gpu.Devices = append(allocated.Gpu.Devices, gpu) - } - - return allocated, nil -} diff --git a/cmd/dra-example-kubeletplugin/main.go b/cmd/dra-example-kubeletplugin/main.go index 967ef5f6..1f1aaa5f 100644 --- a/cmd/dra-example-kubeletplugin/main.go +++ b/cmd/dra-example-kubeletplugin/main.go @@ -25,16 +25,14 @@ import ( "github.com/urfave/cli/v2" - plugin "k8s.io/dynamic-resource-allocation/kubeletplugin" + coreclientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" - gpucrd "sigs.k8s.io/dra-example-driver/api/example.com/resource/gpu/v1alpha1" - exampleclientset "sigs.k8s.io/dra-example-driver/pkg/example.com/resource/clientset/versioned" "sigs.k8s.io/dra-example-driver/pkg/flags" ) const ( - DriverName = gpucrd.GroupName + DriverName = "gpu.example.com" PluginRegistrationPath = "/var/lib/kubelet/plugins_registry/" + DriverName + ".sock" DriverPluginPath = "/var/lib/kubelet/plugins/" + DriverName @@ -44,15 +42,15 @@ const ( type Flags struct { kubeClientConfig flags.KubeClientConfig - crdConfig flags.CRDConfig loggingConfig *flags.LoggingConfig - cdiRoot string + nodeName string + cdiRoot string } type Config struct { - flags *Flags - exampleclient exampleclientset.Interface + flags *Flags + coreclient coreclientset.Interface } func main() { 
@@ -67,6 +65,13 @@ func newApp() *cli.App { loggingConfig: flags.NewLoggingConfig(), } cliFlags := []cli.Flag{ + &cli.StringFlag{ + Name: "node-name", + Usage: "The name of the node to be worked on.", + Required: true, + Destination: &flags.nodeName, + EnvVars: []string{"NODE_NAME"}, + }, &cli.StringFlag{ Name: "cdi-root", Usage: "Absolute path to the directory where CDI files will be generated.", @@ -76,7 +81,6 @@ func newApp() *cli.App { }, } cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...) - cliFlags = append(cliFlags, flags.crdConfig.Flags()...) cliFlags = append(cliFlags, flags.loggingConfig.Flags()...) app := &cli.App{ @@ -99,8 +103,8 @@ func newApp() *cli.App { } config := &Config{ - flags: flags, - exampleclient: clientSets.Example, + flags: flags, + coreclient: clientSets.Core, } return StartPlugin(ctx, config) @@ -134,22 +138,10 @@ func StartPlugin(ctx context.Context, config *Config) error { return err } - dp, err := plugin.Start( - driver, - plugin.DriverName(DriverName), - plugin.RegistrarSocketPath(PluginRegistrationPath), - plugin.PluginSocketPath(DriverPluginSocketPath), - plugin.KubeletPluginSocketPath(DriverPluginSocketPath)) - if err != nil { - return err - } - sigc := make(chan os.Signal, 1) signal.Notify(sigc, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) <-sigc - dp.Stop() - err = driver.Shutdown(ctx) if err != nil { klog.FromContext(ctx).Error(err, "Unable to cleanly shutdown driver") diff --git a/cmd/dra-example-kubeletplugin/state.go b/cmd/dra-example-kubeletplugin/state.go index a0fee6d7..a8fc901c 100644 --- a/cmd/dra-example-kubeletplugin/state.go +++ b/cmd/dra-example-kubeletplugin/state.go @@ -18,56 +18,16 @@ package main import ( "fmt" - "strings" "sync" resourceapi "k8s.io/api/resource/v1alpha3" + drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" - - gpucrd "sigs.k8s.io/dra-example-driver/api/example.com/resource/gpu/v1alpha1" ) -type AllocatableDevices 
map[string]*AllocatableDeviceInfo -type PreparedClaims map[string]*PreparedDevices - -type GpuInfo struct { - UUID string `json:"uuid"` - model string -} - -type AllocatedGpus struct { - Devices []string `json:"devices"` -} - -type AllocatedDevices struct { - Gpu *AllocatedGpus `json:"gpu"` -} - -type PreparedGpus struct { - Devices []*GpuInfo `json:"devices"` -} - -type PreparedDevices struct { - Gpu *PreparedGpus `json:"gpu"` -} - -func (d AllocatedDevices) Type() string { - if d.Gpu != nil { - return gpucrd.GpuDeviceType - } - return gpucrd.UnknownDeviceType -} - -func (d PreparedDevices) Type() string { - if d.Gpu != nil { - return gpucrd.GpuDeviceType - } - return gpucrd.UnknownDeviceType -} - -type AllocatableDeviceInfo struct { - *GpuInfo -} +type AllocatableDevices map[string]resourceapi.Device +type PreparedDevices []*drapbv1.Device +type PreparedClaims map[string]PreparedDevices type DeviceState struct { sync.Mutex @@ -122,10 +82,12 @@ func NewDeviceState(config *Config) (*DeviceState, error) { return state, nil } -func (s *DeviceState) Prepare(claimUID string, allocation AllocatedDevices) ([]string, error) { +func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) { s.Lock() defer s.Unlock() + claimUID := string(claim.UID) + checkpoint := newCheckpoint() if err := s.checkpointManager.GetCheckpoint(DriverPluginCheckpointFile, checkpoint); err != nil { return nil, fmt.Errorf("unable to sync from checkpoint: %v", err) @@ -133,42 +95,24 @@ func (s *DeviceState) Prepare(claimUID string, allocation AllocatedDevices) ([]s preparedClaims := checkpoint.V1.PreparedClaims if preparedClaims[claimUID] != nil { - cdiDevices, err := s.cdi.GetClaimDevices(claimUID, preparedClaims[claimUID]) - if err != nil { - return nil, fmt.Errorf("unable to get CDI devices names: %v", err) - } - return cdiDevices, nil + return preparedClaims[claimUID], nil } - preparedDevices := &PreparedDevices{} - - var err error - switch allocation.Type() { - 
case gpucrd.GpuDeviceType: - preparedDevices.Gpu, err = s.prepareGpus(claimUID, allocation.Gpu) - default: - err = fmt.Errorf("unknown device type: %v", allocation.Type()) - } + preparedDevices, err := s.prepareDevices(claim) if err != nil { - return nil, fmt.Errorf("praparation failed: %v", err) + return nil, fmt.Errorf("prepare failed: %v", err) } - err = s.cdi.CreateClaimSpecFile(claimUID, preparedDevices) - if err != nil { + if err = s.cdi.CreateClaimSpecFile(claimUID, preparedDevices); err != nil { return nil, fmt.Errorf("unable to create CDI spec file for claim: %v", err) } - cdiDevices, err := s.cdi.GetClaimDevices(claimUID, preparedDevices) - if err != nil { - return nil, fmt.Errorf("unable to get CDI devices names: %v", err) - } - preparedClaims[claimUID] = preparedDevices if err := s.checkpointManager.CreateCheckpoint(DriverPluginCheckpointFile, checkpoint); err != nil { return nil, fmt.Errorf("unable to sync to checkpoint: %v", err) } - return cdiDevices, nil + return preparedClaims[claimUID], nil } func (s *DeviceState) Unprepare(claimUID string) error { @@ -185,14 +129,8 @@ func (s *DeviceState) Unprepare(claimUID string) error { return nil } - switch preparedClaims[claimUID].Type() { - case gpucrd.GpuDeviceType: - err := s.unprepareGpus(claimUID, preparedClaims[claimUID]) - if err != nil { - return fmt.Errorf("unprepare failed: %v", err) - } - default: - return fmt.Errorf("unknown device type: %v", preparedClaims[claimUID].Type()) + if err := s.unprepareDevices(claimUID, preparedClaims[claimUID]); err != nil { + return fmt.Errorf("unprepare failed: %v", err) } err := s.cdi.DeleteClaimSpecFile(claimUID) @@ -208,38 +146,30 @@ func (s *DeviceState) Unprepare(claimUID string) error { return nil } -func (s *DeviceState) prepareGpus(claimUID string, allocated *AllocatedGpus) (*PreparedGpus, error) { - prepared := &PreparedGpus{} +func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (PreparedDevices, error) { + if claim.Status.Allocation == 
nil { + return nil, fmt.Errorf("claim not yet allocated") + } - for _, device := range allocated.Devices { - gpuInfo := s.allocatable[device].GpuInfo + var preparedDevices PreparedDevices + for _, result := range claim.Status.Allocation.Devices.Results { + if _, exists := s.allocatable[result.Device]; !exists { + return nil, fmt.Errorf("requested GPU is not allocatable: %v", result.Device) + } - if _, exists := s.allocatable[device]; !exists { - return nil, fmt.Errorf("requested GPU is not allocatable: %v", device) + device := &drapbv1.Device{ + RequestNames: []string{result.Request}, + PoolName: result.Pool, + DeviceName: result.Device, + CDIDeviceIDs: s.cdi.GetClaimDevices([]string{result.Device}), } - prepared.Devices = append(prepared.Devices, gpuInfo) + preparedDevices = append(preparedDevices, device) } - return prepared, nil + return preparedDevices, nil } -func (s *DeviceState) unprepareGpus(claimUID string, devices *PreparedDevices) error { +func (s *DeviceState) unprepareDevices(claimUID string, devices PreparedDevices) error { return nil } - -func (s *DeviceState) getResourceModelFromAllocatableDevices() resourceapi.ResourceModel { - var instances []resourceapi.NamedResourcesInstance - for _, device := range s.allocatable { - instance := resourceapi.NamedResourcesInstance{ - Name: strings.ToLower(device.UUID), - } - instances = append(instances, instance) - } - - model := resourceapi.ResourceModel{ - NamedResources: &resourceapi.NamedResourcesResources{Instances: instances}, - } - - return model -} diff --git a/demo/gpu-test1.yaml b/demo/gpu-test1.yaml index b1fa51b7..ad1082dd 100644 --- a/demo/gpu-test1.yaml +++ b/demo/gpu-test1.yaml @@ -15,7 +15,10 @@ metadata: name: gpu.example.com spec: spec: - resourceClassName: gpu.example.com + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com --- apiVersion: v1 @@ -36,8 +39,7 @@ spec: - name: gpu resourceClaims: - name: gpu - source: - resourceClaimTemplateName: gpu.example.com + 
resourceClaimTemplateName: gpu.example.com --- apiVersion: v1 @@ -58,5 +60,4 @@ spec: - name: gpu resourceClaims: - name: gpu - source: - resourceClaimTemplateName: gpu.example.com + resourceClaimTemplateName: gpu.example.com diff --git a/demo/gpu-test2.yaml b/demo/gpu-test2.yaml index dee7d52d..1f466b39 100644 --- a/demo/gpu-test2.yaml +++ b/demo/gpu-test2.yaml @@ -15,7 +15,10 @@ metadata: name: gpu.example.com spec: spec: - resourceClassName: gpu.example.com + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com --- apiVersion: v1 @@ -41,5 +44,4 @@ spec: - name: shared-gpu resourceClaims: - name: shared-gpu - source: - resourceClaimTemplateName: gpu.example.com + resourceClaimTemplateName: gpu.example.com diff --git a/demo/gpu-test3.yaml b/demo/gpu-test3.yaml index 54e7146d..02dd4c50 100644 --- a/demo/gpu-test3.yaml +++ b/demo/gpu-test3.yaml @@ -14,7 +14,10 @@ metadata: namespace: gpu-test3 name: shared-gpu spec: - resourceClassName: gpu.example.com + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com --- apiVersion: v1 @@ -35,8 +38,7 @@ spec: - name: shared-gpu resourceClaims: - name: shared-gpu - source: - resourceClaimName: shared-gpu + resourceClaimName: shared-gpu --- apiVersion: v1 @@ -57,5 +59,4 @@ spec: - name: shared-gpu resourceClaims: - name: shared-gpu - source: - resourceClaimName: shared-gpu + resourceClaimName: shared-gpu diff --git a/demo/gpu-test4.yaml b/demo/gpu-test4.yaml index e24ada37..4bc87e03 100644 --- a/demo/gpu-test4.yaml +++ b/demo/gpu-test4.yaml @@ -7,15 +7,6 @@ kind: Namespace metadata: name: gpu-test4 ---- -apiVersion: gpu.resource.example.com/v1alpha1 -kind: GpuClaimParameters -metadata: - namespace: gpu-test4 - name: multiple-gpus -spec: - count: 4 - --- apiVersion: resource.k8s.io/v1alpha3 kind: ResourceClaimTemplate @@ -24,11 +15,12 @@ metadata: name: multiple-gpus spec: spec: - resourceClassName: gpu.example.com - parametersRef: - apiGroup: gpu.resource.example.com - kind: GpuClaimParameters - 
name: multiple-gpus + devices: + requests: + - name: gpus + deviceClassName: gpu.example.com + countMode: Exact + count: 4 --- apiVersion: v1 @@ -49,5 +41,4 @@ spec: - name: gpus resourceClaims: - name: gpus - source: - resourceClaimTemplateName: multiple-gpus + resourceClaimTemplateName: multiple-gpus diff --git a/deployments/container/Dockerfile b/deployments/container/Dockerfile index bb0b597b..dd6c340b 100644 --- a/deployments/container/Dockerfile +++ b/deployments/container/Dockerfile @@ -34,5 +34,4 @@ LABEL release="N/A" LABEL summary="Example DRA resource driver for Kubernetes" LABEL description="See summary" -COPY --from=build /artifacts/dra-example-controller /usr/bin/dra-example-controller COPY --from=build /artifacts/dra-example-kubeletplugin /usr/bin/dra-example-kubeletplugin diff --git a/deployments/helm/dra-example-driver/templates/controller.yaml b/deployments/helm/dra-example-driver/templates/controller.yaml deleted file mode 100644 index ad07ba91..00000000 --- a/deployments/helm/dra-example-driver/templates/controller.yaml +++ /dev/null @@ -1,58 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "dra-example-driver.fullname" . }}-controller - namespace: {{ include "dra-example-driver.namespace" . }} - labels: - {{- include "dra-example-driver.labels" . | nindent 4 }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "dra-example-driver.selectorLabels" . | nindent 6 }} - template: - metadata: - {{- with .Values.controller.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "dra-example-driver.templateLabels" . | nindent 8 }} - spec: - {{- if .Values.controller.priorityClassName }} - priorityClassName: {{ .Values.controller.priorityClassName }} - {{- end }} - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "dra-example-driver.serviceAccountName" . 
}} - securityContext: - {{- toYaml .Values.controller.podSecurityContext | nindent 8 }} - containers: - - name: controller - securityContext: - {{- toYaml .Values.controller.containers.controller.securityContext | nindent 10 }} - image: {{ include "dra-example-driver.fullimage" . }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["dra-example-controller"] - resources: - {{- toYaml .Values.controller.containers.controller.resources | nindent 10 }} - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - {{- with .Values.controller.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.controller.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.controller.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} diff --git a/deployments/helm/dra-example-driver/templates/deviceclass.yaml b/deployments/helm/dra-example-driver/templates/deviceclass.yaml new file mode 100644 index 00000000..51cf395f --- /dev/null +++ b/deployments/helm/dra-example-driver/templates/deviceclass.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: DeviceClass +metadata: + name: gpu.example.com +spec: + selectors: + - cel: + expression: "device.driver == 'gpu.example.com'" diff --git a/deployments/helm/dra-example-driver/templates/resourceclass.yaml b/deployments/helm/dra-example-driver/templates/resourceclass.yaml deleted file mode 100644 index 2f59a68a..00000000 --- a/deployments/helm/dra-example-driver/templates/resourceclass.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -apiVersion: resource.k8s.io/v1alpha3 -kind: ResourceClass -metadata: - name: gpu.example.com -driverName: gpu.resource.example.com -structuredParameters: true diff --git a/go.mod b/go.mod index 22890532..f6329529 100644 --- a/go.mod +++ b/go.mod @@ -7,15 +7,14 @@ replace ( k8s.io/apimachinery => github.com/pohly/kubernetes/staging/src/k8s.io/apimachinery 
v0.0.0-20240712091519-93ab1a4e8a5f k8s.io/client-go => github.com/pohly/kubernetes/staging/src/k8s.io/client-go v0.0.0-20240712091519-93ab1a4e8a5f k8s.io/component-base => github.com/pohly/kubernetes/staging/src/k8s.io/component-base v0.0.0-20240712091519-93ab1a4e8a5f - k8s.io/dynamic-resource-allocation => github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712091519-93ab1a4e8a5f - k8s.io/kubelet => github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712091519-93ab1a4e8a5f + k8s.io/dynamic-resource-allocation => github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712183215-a537356172a4 + k8s.io/kubelet => github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712183215-a537356172a4 k8s.io/kubernetes => github.com/pohly/kubernetes v1.10.0-alpha.3.0.20240712091519-93ab1a4e8a5f ) require ( github.com/container-orchestrated-devices/container-device-interface v0.5.4 github.com/google/uuid v1.6.0 - github.com/prometheus/client_golang v1.19.1 github.com/spf13/pflag v1.0.5 github.com/urfave/cli/v2 v2.25.3 k8s.io/api v0.0.0 @@ -60,6 +59,7 @@ require ( github.com/opencontainers/runtime-spec v1.0.3-0.20220909204839-494a5a6aca78 // indirect github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_golang v1.19.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect diff --git a/go.sum b/go.sum index a57f81cf..e3022005 100644 --- a/go.sum +++ b/go.sum @@ -112,10 +112,10 @@ github.com/pohly/kubernetes/staging/src/k8s.io/client-go v0.0.0-20240712091519-9 github.com/pohly/kubernetes/staging/src/k8s.io/client-go v0.0.0-20240712091519-93ab1a4e8a5f/go.mod h1:IpUaqYx61bn61aCxEwjc/gaSmg0xOHtjg5rE/38ZPZ0= github.com/pohly/kubernetes/staging/src/k8s.io/component-base 
v0.0.0-20240712091519-93ab1a4e8a5f h1:y6/vLA/fpqjZfcv1MKqEF+uBQG+KiSGxOR5fs+O3RCE= github.com/pohly/kubernetes/staging/src/k8s.io/component-base v0.0.0-20240712091519-93ab1a4e8a5f/go.mod h1:btdhYbDSRljO9W9EVEeqW1RwQio6cUyp9HlqJybUaFo= -github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712091519-93ab1a4e8a5f h1:tMzIqcWUHH+yHl6Bv8fx3uzyhoHdFBoy5WxG+tZCzlw= -github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712091519-93ab1a4e8a5f/go.mod h1:QOIco1H/avdrWBXMZamNe1dSXM0h0JWPW1Y5mGGrQns= -github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712091519-93ab1a4e8a5f h1:zQPD3euW2dbQeakFD7ny9ifJSR9CYQxSxSBkTJaOfRY= -github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712091519-93ab1a4e8a5f/go.mod h1:xc7Px2Q1Cwc4X85lFlV3m4HQxLw7gYenzfCF6hFm+2s= +github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712183215-a537356172a4 h1:EHcsUkIe551eqrinKP7ewV74cQYozxqsZtxtp2F08Wc= +github.com/pohly/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20240712183215-a537356172a4/go.mod h1:QOIco1H/avdrWBXMZamNe1dSXM0h0JWPW1Y5mGGrQns= +github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712183215-a537356172a4 h1:4EMMg2SCc2u++KZnRakt0IEA/uJauZqGZq8AG8iB2TE= +github.com/pohly/kubernetes/staging/src/k8s.io/kubelet v0.0.0-20240712183215-a537356172a4/go.mod h1:xc7Px2Q1Cwc4X85lFlV3m4HQxLw7gYenzfCF6hFm+2s= github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= diff --git a/pkg/flags/crds.go b/pkg/flags/crds.go deleted file mode 100644 index d98c4fc7..00000000 --- a/pkg/flags/crds.go +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2023 The Kubernetes Authors. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package flags - -import ( - "github.com/urfave/cli/v2" -) - -type CRDConfig struct { - NodeName string - Namespace string - - HideNodeName bool -} - -func (n *CRDConfig) Flags() []cli.Flag { - flags := []cli.Flag{ - &cli.StringFlag{ - Name: "namespace", - Usage: "The namespace used for the custom resources.", - Value: "default", - Destination: &n.Namespace, - EnvVars: []string{"NAMESPACE"}, - }, - } - if !n.HideNodeName { - flags = append(flags, &cli.StringFlag{ - Name: "node-name", - Usage: "The name of the node to be worked on.", - Required: true, - Destination: &n.NodeName, - EnvVars: []string{"NODE_NAME"}, - }) - } - - return flags -} diff --git a/pkg/flags/kubeclient.go b/pkg/flags/kubeclient.go index 04b0d394..88112ec2 100644 --- a/pkg/flags/kubeclient.go +++ b/pkg/flags/kubeclient.go @@ -24,8 +24,6 @@ import ( coreclientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" - - exampleclientset "sigs.k8s.io/dra-example-driver/pkg/example.com/resource/clientset/versioned" ) type KubeClientConfig struct { @@ -35,8 +33,7 @@ type KubeClientConfig struct { } type ClientSets struct { - Core coreclientset.Interface - Example exampleclientset.Interface + Core coreclientset.Interface } func (k *KubeClientConfig) Flags() []cli.Flag { @@ -102,13 +99,7 @@ func (k *KubeClientConfig) NewClientSets() (ClientSets, error) { return ClientSets{}, fmt.Errorf("create core 
client: %v", err) } - exampleclient, err := exampleclientset.NewForConfig(csconfig) - if err != nil { - return ClientSets{}, fmt.Errorf("create example.com client: %v", err) - } - return ClientSets{ - Core: coreclient, - Example: exampleclient, + Core: coreclient, }, nil }