diff --git a/Makefile b/Makefile index b5350260..57260d03 100644 --- a/Makefile +++ b/Makefile @@ -152,12 +152,16 @@ build: manifests generate fmt vet ## Build manager binary. go build -o bin/manager ./cmd/manager/... OPENSHIFT ?= true +RUN_ARGS ?= .PHONY: run run: install fmt vet ## Run a controller from your host against openshift cluster go run ./cmd/manager/... \ - --kepler.image=$(KEPLER_IMG) --kepler.image.libbpf=$(KEPLER_IMG_LIBBPF) \ - --zap-devel --zap-log-level=8 --openshift=$(OPENSHIFT) 2>&1 | tee tmp/operator.log + --kepler.image=$(KEPLER_IMG) --kepler.image.libbpf=$(KEPLER_IMG_LIBBPF) \ + --zap-devel --zap-log-level=8 \ + --openshift=$(OPENSHIFT) \ + $(RUN_ARGS) \ + 2>&1 | tee tmp/operator.log # docker_tag accepts an image:tag and a list of additional tags comma-separated # it tags the image with the additional tags diff --git a/bundle/manifests/kepler-operator.clusterserviceversion.yaml b/bundle/manifests/kepler-operator.clusterserviceversion.yaml index bbf55379..6647b20d 100644 --- a/bundle/manifests/kepler-operator.clusterserviceversion.yaml +++ b/bundle/manifests/kepler-operator.clusterserviceversion.yaml @@ -27,9 +27,13 @@ metadata: capabilities: Basic Install categories: Monitoring containerImage: quay.io/sustainable_computing_io/kepler-operator:0.9.2 - createdAt: "2023-11-01T12:15:43Z" + createdAt: "2023-11-30T08:58:07Z" description: 'Deploys and Manages Kepler on Kubernetes ' operators.operatorframework.io/builder: operator-sdk-v1.27.0 + operators.operatorframework.io/internal-objects: |- + [ + "keplerinternals.kepler.system.sustainable.computing.io" + ] operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/sustainable-computing-io/kepler-operator name: kepler-operator.v0.9.2 @@ -38,13 +42,27 @@ spec: apiservicedefinitions: {} customresourcedefinitions: owned: + - description: KeplerInternal is the Schema for internal/unsupported API + displayName: KeplerInternal + kind: KeplerInternal + name: keplerinternals.kepler.system.sustainable.computing.io + statusDescriptors: + - description: conditions represent the latest available observations of the + kepler-exporter + displayName: Conditions + path: exporter.conditions + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:conditions + version: v1alpha1 - description: Kepler is the Schema for the keplers API displayName: Kepler kind: Kepler name: keplers.kepler.system.sustainable.computing.io statusDescriptors: - - displayName: Conditions - path: conditions + - description: conditions represent the latest available observations of the + kepler-exporter + displayName: Conditions + path: exporter.conditions x-descriptors: - urn:alm:descriptor:com.tectonic.ui:conditions version: v1alpha1 diff --git a/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml b/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml new file mode 100644 index 00000000..2037642b --- /dev/null +++ b/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml @@ -0,0 +1,275 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.1 + creationTimestamp: null + name: keplerinternals.kepler.system.sustainable.computing.io +spec: + group: kepler.system.sustainable.computing.io + names: + kind: KeplerInternal + listKind: KeplerInternalList + plural: keplerinternals + singular: keplerinternal + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: 
.spec.exporter.deployment.port
+      name: Port
+      type: integer
+    - jsonPath: .status.exporter.desiredNumberScheduled
+      name: Desired
+      type: integer
+    - jsonPath: .status.exporter.currentNumberScheduled
+      name: Current
+      type: integer
+    - jsonPath: .status.exporter.updatedNumberScheduled
+      name: Up-to-date
+      type: integer
+    - jsonPath: .status.exporter.numberReady
+      name: Ready
+      type: integer
+    - jsonPath: .status.exporter.numberAvailable
+      name: Available
+      type: integer
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - jsonPath: .spec.exporter.deployment.image
+      name: Image
+      type: string
+    - jsonPath: .spec.exporter.deployment.nodeSelector
+      name: Node-Selector
+      priority: 10
+      type: string
+    - jsonPath: .spec.exporter.deployment.tolerations
+      name: Tolerations
+      priority: 10
+      type: string
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: KeplerInternal is the Schema for the keplers internal API
+        properties:
+          apiVersion:
+            description: 'APIVersion defines the versioned schema of this representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: KeplerInternalSpec defines the desired state of KeplerInternal
+            properties:
+              exporter:
+                properties:
+                  deployment:
+                    properties:
+                      image:
+                        description: Image of kepler-exporter to be deployed
+                        minLength: 3
+                        type: string
+                      namespace:
+                        description: Namespace where kepler-exporter will be deployed
+                        minLength: 1
+                        type: string
+                      nodeSelector:
+                        additionalProperties:
+                          type: string
+                        default:
+                          kubernetes.io/os: linux
+                        description: Defines which Nodes the Pod is scheduled on
+                        type: object
+                      port:
+                        default: 9103
+                        format: int32
+                        maximum: 65535
+                        minimum: 1
+                        type: integer
+                      tolerations:
+                        default:
+                        - effect: ""
+                          key: ""
+                          operator: Exists
+                          value: ""
+                        description: If specified, define Pod's tolerations
+                        items:
+                          description: The pod this Toleration is attached to tolerates
+                            any taint that matches the triple <key,value,effect> using
+                            the matching operator <operator>.
+                          properties:
+                            effect:
+                              description: Effect indicates the taint effect to match.
+                                Empty means match all taint effects. When specified,
+                                allowed values are NoSchedule, PreferNoSchedule and
+                                NoExecute.
+                              type: string
+                            key:
+                              description: Key is the taint key that the toleration
+                                applies to. Empty means match all taint keys. If the
+                                key is empty, operator must be Exists; this combination
+                                means to match all values and all keys.
+                              type: string
+                            operator:
+                              description: Operator represents a key's relationship
+                                to the value. Valid operators are Exists and Equal.
+                                Defaults to Equal. Exists is equivalent to wildcard
+                                for value, so that a pod can tolerate all taints of
+                                a particular category.
+                              type: string
+                            tolerationSeconds:
+                              description: TolerationSeconds represents the period
+                                of time the toleration (which must be of effect NoExecute,
+                                otherwise this field is ignored) tolerates the taint.
+                                By default, it is not set, which means tolerate the
+                                taint forever (do not evict).
Zero and negative values + will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration + matches to. If the operator is Exists, the value should + be empty, otherwise just a regular string. + type: string + type: object + type: array + required: + - image + - namespace + type: object + required: + - deployment + type: object + openshift: + properties: + dashboard: + properties: + enabled: + default: false + type: boolean + type: object + enabled: + default: true + type: boolean + required: + - enabled + type: object + required: + - exporter + type: object + status: + description: KeplerInternalStatus represents status of KeplerInternal + properties: + exporter: + description: ExporterStatus defines the observed state of Kepler Exporter + properties: + conditions: + description: conditions represent the latest available observations + of the kepler-exporter + items: + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be + when the underlying condition changed. If that is not + known, then using the time when the API field changed + is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if + .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the + current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + currentNumberScheduled: + description: The number of nodes that are running at least 1 kepler + pod and are supposed to run the kepler pod. + format: int32 + type: integer + desiredNumberScheduled: + description: The total number of nodes that should be running + the kepler pod (including nodes correctly running the kepler + pod). + format: int32 + type: integer + numberAvailable: + description: The number of nodes that should be running the kepler + pod and have one or more of the kepler pod running and available + format: int32 + type: integer + numberMisscheduled: + description: The number of nodes that are running the kepler pod, + but are not supposed to run the kepler pod. + format: int32 + type: integer + numberReady: + description: numberReady is the number of nodes that should be + running the kepler pod and have one or more of the kepler pod + running with a Ready Condition. 
+ format: int32 + type: integer + numberUnavailable: + description: The number of nodes that should be running the kepler + pod and have none of the kepler pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + kepler pod + format: int32 + type: integer + required: + - conditions + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/manifests/kepler.system.sustainable.computing.io_keplers.yaml b/bundle/manifests/kepler.system.sustainable.computing.io_keplers.yaml index 72295f8d..b60f6bcf 100644 --- a/bundle/manifests/kepler.system.sustainable.computing.io_keplers.yaml +++ b/bundle/manifests/kepler.system.sustainable.computing.io_keplers.yaml @@ -18,19 +18,19 @@ spec: - jsonPath: .spec.exporter.deployment.port name: Port type: integer - - jsonPath: .status.desiredNumberScheduled + - jsonPath: .status.exporter.desiredNumberScheduled name: Desired type: integer - - jsonPath: .status.currentNumberScheduled + - jsonPath: .status.exporter.currentNumberScheduled name: Current type: integer - - jsonPath: .status.numberReady + - jsonPath: .status.exporter.numberReady name: Ready type: integer - - jsonPath: .status.updatedNumberScheduled + - jsonPath: .status.exporter.updatedNumberScheduled name: Up-to-date type: integer - - jsonPath: .status.numberAvailable + - jsonPath: .status.exporter.numberAvailable name: Available type: integer - jsonPath: .metadata.creationTimestamp @@ -134,92 +134,101 @@ spec: status: description: KeplerStatus defines the observed state of Kepler properties: - conditions: - items: - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. - type: string - status: - description: status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of Kepler Condition - Reconciled, Available - ... - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-type: atomic - currentNumberScheduled: - description: The number of nodes that are running at least 1 kepler - pod and are supposed to run the kepler pod. - format: int32 - type: integer - desiredNumberScheduled: - description: The total number of nodes that should be running the - kepler pod (including nodes correctly running the kepler pod). 
- format: int32 - type: integer - numberAvailable: - description: The number of nodes that should be running the kepler - pod and have one or more of the kepler pod running and available - format: int32 - type: integer - numberMisscheduled: - description: The number of nodes that are running the kepler pod, - but are not supposed to run the kepler pod. - format: int32 - type: integer - numberReady: - description: numberReady is the number of nodes that should be running - the kepler pod and have one or more of the kepler pod running with - a Ready Condition. - format: int32 - type: integer - numberUnavailable: - description: The number of nodes that should be running the kepler - pod and have none of the kepler pod running and available - format: int32 - type: integer - updatedNumberScheduled: - description: The total number of nodes that are running updated kepler - pod - format: int32 - type: integer - required: - - conditions - - currentNumberScheduled - - desiredNumberScheduled - - numberMisscheduled - - numberReady + exporter: + description: ExporterStatus defines the observed state of Kepler Exporter + properties: + conditions: + description: conditions represent the latest available observations + of the kepler-exporter + items: + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be + when the underlying condition changed. If that is not + known, then using the time when the API field changed + is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if + .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the + current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + currentNumberScheduled: + description: The number of nodes that are running at least 1 kepler + pod and are supposed to run the kepler pod. + format: int32 + type: integer + desiredNumberScheduled: + description: The total number of nodes that should be running + the kepler pod (including nodes correctly running the kepler + pod). + format: int32 + type: integer + numberAvailable: + description: The number of nodes that should be running the kepler + pod and have one or more of the kepler pod running and available + format: int32 + type: integer + numberMisscheduled: + description: The number of nodes that are running the kepler pod, + but are not supposed to run the kepler pod. + format: int32 + type: integer + numberReady: + description: numberReady is the number of nodes that should be + running the kepler pod and have one or more of the kepler pod + running with a Ready Condition. 
+ format: int32 + type: integer + numberUnavailable: + description: The number of nodes that should be running the kepler + pod and have none of the kepler pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + kepler pod + format: int32 + type: integer + required: + - conditions + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object type: object type: object served: true diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 276e73f8..e7fddb68 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -19,6 +19,7 @@ package main import ( "flag" "os" + "strings" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -38,7 +39,6 @@ import ( securityv1 "github.com/openshift/api/security/v1" keplersystemv1alpha1 "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" - "github.com/sustainable.computing.io/kepler-operator/pkg/components" "github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter" "github.com/sustainable.computing.io/kepler-operator/pkg/controllers" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" @@ -59,11 +59,24 @@ func init() { //+kubebuilder:scaffold:scheme } +type stringList []string + +func (f *stringList) String() string { + return "multiple values" +} + +func (s *stringList) Set(value string) error { + values := strings.Split(value, ",") + *s = append(*s, values...) + return nil +} + func main() { var metricsAddr string var enableLeaderElection bool var openshift bool var probeAddr string + var additionalNamespaces stringList flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -71,14 +84,20 @@ func main() { "Enable leader election for controller manager. 
"+ "Enabling this will ensure there is only one active controller manager.") + flag.StringVar(&controllers.KeplerDeploymentNS, "deployment-namespace", controllers.KeplerDeploymentNS, + "Namespace where kepler and its components are deployed.") + + flag.CommandLine.Var(flag.Value(&additionalNamespaces), "watch-namespaces", + "Namespaces other than deployment-namespace where kepler-internal may be deployed.") + flag.BoolVar(&openshift, "openshift", false, "Indicate if the operator is running on an OpenShift cluster.") // NOTE: RELATED_IMAGE_KEPLER can be set as env or flag, flag takes precedence over env keplerImage := os.Getenv("RELATED_IMAGE_KEPLER") keplerImageLibbpf := os.Getenv("RELATED_IMAGE_KEPLER_LIBBPF") - flag.StringVar(&exporter.Config.Image, "kepler.image", keplerImage, "kepler image") - flag.StringVar(&exporter.Config.ImageLibbpf, "kepler.image.libbpf", keplerImageLibbpf, "kepler libbpf image") + flag.StringVar(&controllers.Config.Image, "kepler.image", keplerImage, "kepler image") + flag.StringVar(&controllers.Config.ImageLibbpf, "kepler.image.libbpf", keplerImageLibbpf, "kepler libbpf image") opts := zap.Options{ Development: true, @@ -88,6 +107,10 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + if openshift { + controllers.Config.Cluster = k8s.OpenShift + } + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ @@ -95,11 +118,14 @@ func main() { }, NewCache: func(config *rest.Config, opts cache.Options) (cache.Cache, error) { cacheNs := map[string]cache.Config{ - components.Namespace: {}, + controllers.KeplerDeploymentNS: {}, } if openshift { cacheNs[exporter.DashboardNs] = cache.Config{} } + for _, ns := range additionalNamespaces { + cacheNs[ns] = cache.Config{} + } opts.DefaultNamespaces = cacheNs return cache.New(config, opts) }, @@ -125,17 +151,18 @@ func main() { os.Exit(1) } - cluster := k8s.Kubernetes - if openshift { - cluster = k8s.OpenShift - } - if err = (&controllers.KeplerReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Cluster: cluster, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "kepler") + os.Exit(1) + } + if err = (&controllers.KeplerInternalReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Kepler") + setupLog.Error(err, "unable to create controller", "controller", "kepler-internal") os.Exit(1) } //+kubebuilder:scaffold:builder diff --git a/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml b/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml new file mode 100644 index 00000000..9c709148 --- /dev/null +++ b/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml @@ -0,0 +1,269 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.1 + name: keplerinternals.kepler.system.sustainable.computing.io +spec: + group: kepler.system.sustainable.computing.io + names: + kind: KeplerInternal + listKind: KeplerInternalList + plural: keplerinternals + singular: keplerinternal + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.exporter.deployment.port + name: Port + type: integer + - jsonPath: .status.exporter.desiredNumberScheduled + name: 
Desired
+      type: integer
+    - jsonPath: .status.exporter.currentNumberScheduled
+      name: Current
+      type: integer
+    - jsonPath: .status.exporter.updatedNumberScheduled
+      name: Up-to-date
+      type: integer
+    - jsonPath: .status.exporter.numberReady
+      name: Ready
+      type: integer
+    - jsonPath: .status.exporter.numberAvailable
+      name: Available
+      type: integer
+    - jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - jsonPath: .spec.exporter.deployment.image
+      name: Image
+      type: string
+    - jsonPath: .spec.exporter.deployment.nodeSelector
+      name: Node-Selector
+      priority: 10
+      type: string
+    - jsonPath: .spec.exporter.deployment.tolerations
+      name: Tolerations
+      priority: 10
+      type: string
+    name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        description: KeplerInternal is the Schema for the keplers internal API
+        properties:
+          apiVersion:
+            description: 'APIVersion defines the versioned schema of this representation
+              of an object. Servers should convert recognized schemas to the latest
+              internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
+            type: string
+          kind:
+            description: 'Kind is a string value representing the REST resource this
+              object represents. Servers may infer this from the endpoint the client
+              submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: KeplerInternalSpec defines the desired state of KeplerInternal
+            properties:
+              exporter:
+                properties:
+                  deployment:
+                    properties:
+                      image:
+                        description: Image of kepler-exporter to be deployed
+                        minLength: 3
+                        type: string
+                      namespace:
+                        description: Namespace where kepler-exporter will be deployed
+                        minLength: 1
+                        type: string
+                      nodeSelector:
+                        additionalProperties:
+                          type: string
+                        default:
+                          kubernetes.io/os: linux
+                        description: Defines which Nodes the Pod is scheduled on
+                        type: object
+                      port:
+                        default: 9103
+                        format: int32
+                        maximum: 65535
+                        minimum: 1
+                        type: integer
+                      tolerations:
+                        default:
+                        - effect: ""
+                          key: ""
+                          operator: Exists
+                          value: ""
+                        description: If specified, define Pod's tolerations
+                        items:
+                          description: The pod this Toleration is attached to tolerates
+                            any taint that matches the triple <key,value,effect> using
+                            the matching operator <operator>.
+                          properties:
+                            effect:
+                              description: Effect indicates the taint effect to match.
+                                Empty means match all taint effects. When specified,
+                                allowed values are NoSchedule, PreferNoSchedule and
+                                NoExecute.
+                              type: string
+                            key:
+                              description: Key is the taint key that the toleration
+                                applies to. Empty means match all taint keys. If the
+                                key is empty, operator must be Exists; this combination
+                                means to match all values and all keys.
+                              type: string
+                            operator:
+                              description: Operator represents a key's relationship
+                                to the value. Valid operators are Exists and Equal.
+                                Defaults to Equal. Exists is equivalent to wildcard
+                                for value, so that a pod can tolerate all taints of
+                                a particular category.
+                              type: string
+                            tolerationSeconds:
+                              description: TolerationSeconds represents the period
+                                of time the toleration (which must be of effect NoExecute,
+                                otherwise this field is ignored) tolerates the taint.
+                                By default, it is not set, which means tolerate the
+                                taint forever (do not evict). Zero and negative values
+                                will be treated as 0 (evict immediately) by the system.
+ format: int64 + type: integer + value: + description: Value is the taint value the toleration + matches to. If the operator is Exists, the value should + be empty, otherwise just a regular string. + type: string + type: object + type: array + required: + - image + - namespace + type: object + required: + - deployment + type: object + openshift: + properties: + dashboard: + properties: + enabled: + default: false + type: boolean + type: object + enabled: + default: true + type: boolean + required: + - enabled + type: object + required: + - exporter + type: object + status: + description: KeplerInternalStatus represents status of KeplerInternal + properties: + exporter: + description: ExporterStatus defines the observed state of Kepler Exporter + properties: + conditions: + description: conditions represent the latest available observations + of the kepler-exporter + items: + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be + when the underlying condition changed. If that is not + known, then using the time when the API field changed + is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if + .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the + current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + currentNumberScheduled: + description: The number of nodes that are running at least 1 kepler + pod and are supposed to run the kepler pod. + format: int32 + type: integer + desiredNumberScheduled: + description: The total number of nodes that should be running + the kepler pod (including nodes correctly running the kepler + pod). + format: int32 + type: integer + numberAvailable: + description: The number of nodes that should be running the kepler + pod and have one or more of the kepler pod running and available + format: int32 + type: integer + numberMisscheduled: + description: The number of nodes that are running the kepler pod, + but are not supposed to run the kepler pod. + format: int32 + type: integer + numberReady: + description: numberReady is the number of nodes that should be + running the kepler pod and have one or more of the kepler pod + running with a Ready Condition. 
+ format: int32 + type: integer + numberUnavailable: + description: The number of nodes that should be running the kepler + pod and have none of the kepler pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + kepler pod + format: int32 + type: integer + required: + - conditions + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/kepler.system.sustainable.computing.io_keplers.yaml b/config/crd/bases/kepler.system.sustainable.computing.io_keplers.yaml index a53ac4de..7816020e 100644 --- a/config/crd/bases/kepler.system.sustainable.computing.io_keplers.yaml +++ b/config/crd/bases/kepler.system.sustainable.computing.io_keplers.yaml @@ -18,19 +18,19 @@ spec: - jsonPath: .spec.exporter.deployment.port name: Port type: integer - - jsonPath: .status.desiredNumberScheduled + - jsonPath: .status.exporter.desiredNumberScheduled name: Desired type: integer - - jsonPath: .status.currentNumberScheduled + - jsonPath: .status.exporter.currentNumberScheduled name: Current type: integer - - jsonPath: .status.numberReady + - jsonPath: .status.exporter.numberReady name: Ready type: integer - - jsonPath: .status.updatedNumberScheduled + - jsonPath: .status.exporter.updatedNumberScheduled name: Up-to-date type: integer - - jsonPath: .status.numberAvailable + - jsonPath: .status.exporter.numberAvailable name: Available type: integer - jsonPath: .metadata.creationTimestamp @@ -134,92 +134,101 @@ spec: status: description: KeplerStatus defines the observed state of Kepler properties: - conditions: - items: - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should be when - the underlying condition changed. If that is not known, then - using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. For instance, if .metadata.generation - is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the current - state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier indicating - the reason for the condition's last transition. - type: string - status: - description: status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of Kepler Condition - Reconciled, Available - ... - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-type: atomic - currentNumberScheduled: - description: The number of nodes that are running at least 1 kepler - pod and are supposed to run the kepler pod. - format: int32 - type: integer - desiredNumberScheduled: - description: The total number of nodes that should be running the - kepler pod (including nodes correctly running the kepler pod). 
- format: int32 - type: integer - numberAvailable: - description: The number of nodes that should be running the kepler - pod and have one or more of the kepler pod running and available - format: int32 - type: integer - numberMisscheduled: - description: The number of nodes that are running the kepler pod, - but are not supposed to run the kepler pod. - format: int32 - type: integer - numberReady: - description: numberReady is the number of nodes that should be running - the kepler pod and have one or more of the kepler pod running with - a Ready Condition. - format: int32 - type: integer - numberUnavailable: - description: The number of nodes that should be running the kepler - pod and have none of the kepler pod running and available - format: int32 - type: integer - updatedNumberScheduled: - description: The total number of nodes that are running updated kepler - pod - format: int32 - type: integer - required: - - conditions - - currentNumberScheduled - - desiredNumberScheduled - - numberMisscheduled - - numberReady + exporter: + description: ExporterStatus defines the observed state of Kepler Exporter + properties: + conditions: + description: conditions represent the latest available observations + of the kepler-exporter + items: + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be + when the underlying condition changed. If that is not + known, then using the time when the API field changed + is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if + .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the + current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + currentNumberScheduled: + description: The number of nodes that are running at least 1 kepler + pod and are supposed to run the kepler pod. + format: int32 + type: integer + desiredNumberScheduled: + description: The total number of nodes that should be running + the kepler pod (including nodes correctly running the kepler + pod). + format: int32 + type: integer + numberAvailable: + description: The number of nodes that should be running the kepler + pod and have one or more of the kepler pod running and available + format: int32 + type: integer + numberMisscheduled: + description: The number of nodes that are running the kepler pod, + but are not supposed to run the kepler pod. + format: int32 + type: integer + numberReady: + description: numberReady is the number of nodes that should be + running the kepler pod and have one or more of the kepler pod + running with a Ready Condition. 
+ format: int32 + type: integer + numberUnavailable: + description: The number of nodes that should be running the kepler + pod and have none of the kepler pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + kepler pod + format: int32 + type: integer + required: + - conditions + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object type: object type: object served: true diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 76de63e4..fdf1da23 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -3,6 +3,7 @@ # It should be run by config/default resources: - bases/kepler.system.sustainable.computing.io_keplers.yaml +- bases/kepler.system.sustainable.computing.io_keplerinternals.yaml #+kubebuilder:scaffold:crdkustomizeresource patchesStrategicMerge: diff --git a/config/manifests/bases/kepler-operator.clusterserviceversion.yaml b/config/manifests/bases/kepler-operator.clusterserviceversion.yaml index 0091c8de..30efac97 100644 --- a/config/manifests/bases/kepler-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/kepler-operator.clusterserviceversion.yaml @@ -8,6 +8,10 @@ metadata: createdAt: "2023-01-31 16:20:00" description: 'Deploys and Manages Kepler on Kubernetes ' operators.operatorframework.io/builder: operator-sdk-v1.27.0 + operators.operatorframework.io/internal-objects: |- + [ + "keplerinternals.kepler.system.sustainable.computing.io" + ] operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 repository: https://github.com/sustainable-computing-io/kepler-operator name: kepler-operator.v0.0.0 @@ -16,13 +20,27 @@ spec: apiservicedefinitions: {} customresourcedefinitions: owned: + - description: KeplerInternal is the Schema for internal/unsupported API + displayName: KeplerInternal + kind: KeplerInternal + name: keplerinternals.kepler.system.sustainable.computing.io + statusDescriptors: + - description: conditions represent the latest available observations of the + kepler-exporter + displayName: Conditions + path: exporter.conditions + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:conditions + version: v1alpha1 - description: Kepler is the Schema for the keplers API displayName: Kepler kind: Kepler name: keplers.kepler.system.sustainable.computing.io statusDescriptors: - - displayName: Conditions - path: conditions + - description: conditions represent the latest available observations of the + kepler-exporter + displayName: Conditions + path: exporter.conditions x-descriptors: - urn:alm:descriptor:com.tectonic.ui:conditions version: v1alpha1 diff --git a/docs/api.md b/docs/api.md index 004204cd..d2c1293a 100644 --- a/docs/api.md +++ b/docs/api.md @@ -8,11 +8,497 @@ Packages: Resource Types: +- [KeplerInternal](#keplerinternal) + - [Kepler](#kepler) +## KeplerInternal +[↩ Parent](#keplersystemsustainablecomputingiov1alpha1 ) + + + + + + +KeplerInternal is the Schema for the keplers internal API + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| apiVersion | string | kepler.system.sustainable.computing.io/v1alpha1 | true |
+| kind | string | KeplerInternal | true |
+| metadata | object | Refer to the Kubernetes API documentation for the fields of the `metadata` field. | true |
+| spec | object | KeplerInternalSpec defines the desired state of KeplerInternal | false |
+| status | object | KeplerInternalStatus represents status of KeplerInternal | false |
+
+
+### KeplerInternal.spec
+[↩ Parent](#keplerinternal)
+
+KeplerInternalSpec defines the desired state of KeplerInternal
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| exporter | object |  | true |
+| openshift | object |  | false |
+
+
+### KeplerInternal.spec.exporter
+[↩ Parent](#keplerinternalspec)
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| deployment | object |  | true |
+
+
+### KeplerInternal.spec.exporter.deployment
+[↩ Parent](#keplerinternalspecexporter)
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| image | string | Image of kepler-exporter to be deployed | true |
+| namespace | string | Namespace where kepler-exporter will be deployed | true |
+| nodeSelector | map[string]string | Defines which Nodes the Pod is scheduled on. *Default*: map[kubernetes.io/os:linux] | false |
+| port | integer | *Format*: int32. *Default*: 9103. *Minimum*: 1. *Maximum*: 65535 | false |
+| tolerations | []object | If specified, define Pod's tolerations. *Default*: [map[effect: key: operator:Exists value:]] | false |
+
+
+### KeplerInternal.spec.exporter.deployment.tolerations[index]
+[↩ Parent](#keplerinternalspecexporterdeployment)
+
+The pod this Toleration is attached to tolerates any taint that matches the triple `<key,value,effect>` using the matching operator `<operator>`.
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| effect | string | Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. | false |
+| key | string | Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. | false |
+| operator | string | Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. | false |
+| tolerationSeconds | integer | TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. *Format*: int64 | false |
+| value | string | Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. | false |
+
+
+### KeplerInternal.spec.openshift
+[↩ Parent](#keplerinternalspec)
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| enabled | boolean | *Default*: true | true |
+| dashboard | object |  | false |
+
+
+### KeplerInternal.spec.openshift.dashboard
+[↩ Parent](#keplerinternalspecopenshift)
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| enabled | boolean | *Default*: false | false |
+
+
+### KeplerInternal.status
+[↩ Parent](#keplerinternal)
+
+KeplerInternalStatus represents status of KeplerInternal
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| exporter | object | ExporterStatus defines the observed state of Kepler Exporter | false |
+
+
+### KeplerInternal.status.exporter
+[↩ Parent](#keplerinternalstatus)
+
+ExporterStatus defines the observed state of Kepler Exporter
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| conditions | []object | conditions represent the latest available observations of the kepler-exporter | true |
+| currentNumberScheduled | integer | The number of nodes that are running at least 1 kepler pod and are supposed to run the kepler pod. *Format*: int32 | true |
+| desiredNumberScheduled | integer | The total number of nodes that should be running the kepler pod (including nodes correctly running the kepler pod). *Format*: int32 | true |
+| numberMisscheduled | integer | The number of nodes that are running the kepler pod, but are not supposed to run the kepler pod. *Format*: int32 | true |
+| numberReady | integer | numberReady is the number of nodes that should be running the kepler pod and have one or more of the kepler pod running with a Ready Condition. *Format*: int32 | true |
+| numberAvailable | integer | The number of nodes that should be running the kepler pod and have one or more of the kepler pod running and available. *Format*: int32 | false |
+| numberUnavailable | integer | The number of nodes that should be running the kepler pod and have none of the kepler pod running and available. *Format*: int32 | false |
+| updatedNumberScheduled | integer | The total number of nodes that are running updated kepler pod. *Format*: int32 | false |
+
+
+### KeplerInternal.status.exporter.conditions[index]
+[↩ Parent](#keplerinternalstatusexporter)
+
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| lastTransitionTime | string | lastTransitionTime is the last time the condition transitioned from one status to another. This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. *Format*: date-time | true |
+| message | string | message is a human readable message indicating details about the transition. This may be an empty string. | true |
+| reason | string | reason contains a programmatic identifier indicating the reason for the condition's last transition. | true |
+| status | string | status of the condition, one of True, False, Unknown. | true |
+| type | string | Type of Kepler Condition - Reconciled, Available ... | true |
+| observedGeneration | integer | observedGeneration represents the .metadata.generation that the condition was set based upon. For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date with respect to the current state of the instance. *Format*: int64. *Minimum*: 0 | false |
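
Pulling the reference above together, a minimal KeplerInternal resource might look like the sketch below; the resource name, image tag, and namespace are illustrative placeholders, not defaults shipped by the operator:

```yaml
apiVersion: kepler.system.sustainable.computing.io/v1alpha1
kind: KeplerInternal
metadata:
  # cluster-scoped resource; the name here is an illustrative placeholder
  name: kepler-dev
spec:
  exporter:
    deployment:
      # image and namespace are the only required fields
      image: quay.io/sustainable_computing_io/kepler:latest   # placeholder tag
      namespace: kepler-dev                                   # placeholder namespace
      port: 9103                      # schema default, shown explicitly
      nodeSelector:
        kubernetes.io/os: linux       # schema default, shown explicitly
  openshift:
    enabled: true
    dashboard:
      enabled: false
```

With the additionalPrinterColumns defined in the CRD, `kubectl get keplerinternals` would then surface the Port, Desired, Current, Up-to-date, Ready, and Available columns for such an object.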
+ ## Kepler [↩ Parent](#keplersystemsustainablecomputingiov1alpha1 ) @@ -245,10 +731,37 @@ KeplerStatus defines the observed state of Kepler - conditions + exporter + object + + ExporterStatus defines the observed state of Kepler Exporter
+ + false + + + + +### Kepler.status.exporter +[↩ Parent](#keplerstatus) + + + +ExporterStatus defines the observed state of Kepler Exporter + + + + + + + + + + + + @@ -318,8 +831,8 @@ KeplerStatus defines the observed state of Kepler
+| Name | Type | Description | Required |
+|------|------|-------------|----------|
+| conditions | []object | conditions represent the latest available observations of the kepler-exporter | true |
+
+
-### Kepler.status.conditions[index]
-[↩ Parent](#keplerstatus)
+### Kepler.status.exporter.conditions[index]
+[↩ Parent](#keplerstatusexporter)


diff --git a/pkg/api/v1alpha1/kepler_internal_types.go b/pkg/api/v1alpha1/kepler_internal_types.go
new file mode 100644
index 00000000..4bd62600
--- /dev/null
+++ b/pkg/api/v1alpha1/kepler_internal_types.go
@@ -0,0 +1,116 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// NOTE: all internal types can depend on public types
+// e.g. kepler-internal.spec.exporter can reuse ExporterSpec because the API is
+// considered stable but not vice-versa.
+
+type InternalExporterDeploymentSpec struct {
+	ExporterDeploymentSpec `json:",inline"`
+	// Image of kepler-exporter to be deployed
+	// +kubebuilder:validation:MinLength=3
+	Image string `json:"image"`
+
+	// Namespace where kepler-exporter will be deployed
+	// +kubebuilder:validation:MinLength=1
+	Namespace string `json:"namespace"`
+}
+
+type InternalExporterSpec struct {
+	// +kubebuilder:validation:Required
+	Deployment InternalExporterDeploymentSpec `json:"deployment"`
+}
+
+type DashboardSpec struct {
+	// +kubebuilder:default=false
+	Enabled bool `json:"enabled,omitempty"`
+}
+
+type OpenShiftSpec struct {
+	// +kubebuilder:default=true
+	Enabled   bool          `json:"enabled"`
+	Dashboard DashboardSpec `json:"dashboard,omitempty"`
+}
+
+// KeplerInternalSpec defines the desired state of KeplerInternal
+type KeplerInternalSpec struct {
+	Exporter  InternalExporterSpec `json:"exporter"`
+	OpenShift OpenShiftSpec        `json:"openshift,omitempty"`
+}
+
+//+kubebuilder:object:root=true
+//+kubebuilder:resource:scope="Cluster"
+//+kubebuilder:subresource:status
+
+// +kubebuilder:printcolumn:name="Port",type=integer,JSONPath=`.spec.exporter.deployment.port`
+// +kubebuilder:printcolumn:name="Desired",type=integer,JSONPath=`.status.exporter.desiredNumberScheduled`
+// +kubebuilder:printcolumn:name="Current",type=integer,JSONPath=`.status.exporter.currentNumberScheduled`
+// +kubebuilder:printcolumn:name="Up-to-date",type=integer,JSONPath=`.status.exporter.updatedNumberScheduled`
+// +kubebuilder:printcolumn:name="Ready",type=integer,JSONPath=`.status.exporter.numberReady`
+// +kubebuilder:printcolumn:name="Available",type=integer,JSONPath=`.status.exporter.numberAvailable`
+// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
+// +kubebuilder:printcolumn:name="Image",type=string,JSONPath=`.spec.exporter.deployment.image`
+// +kubebuilder:printcolumn:name="Node-Selector",type=string,JSONPath=`.spec.exporter.deployment.nodeSelector`,priority=10
+// +kubebuilder:printcolumn:name="Tolerations",type=string,JSONPath=`.spec.exporter.deployment.tolerations`,priority=10
+//
+// KeplerInternal is the Schema for the keplers internal API
+type KeplerInternal struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	Spec   KeplerInternalSpec `json:"spec,omitempty"`
+	Status 
KeplerInternalStatus `json:"status,omitempty"` +} + +// KeplerInternalStatus represents status of KeplerInternal +type KeplerInternalStatus struct { + Exporter ExporterStatus `json:"exporter,omitempty"` +} + +func (ki KeplerInternal) Namespace() string { + return ki.Spec.Exporter.Deployment.Namespace +} + +func (ki KeplerInternal) DaemonsetName() string { + return ki.Name +} + +func (ki KeplerInternal) ServiceAccountName() string { + return ki.Name +} + +func (ki KeplerInternal) FQServiceAccountName() string { + return "system:serviceaccount:" + ki.Namespace() + ":" + ki.Name +} + +//+kubebuilder:object:root=true + +// KeplerInternalList contains a list of Kepler +type KeplerInternalList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []KeplerInternal `json:"items"` +} + +func init() { + SchemeBuilder.Register(&KeplerInternal{}, &KeplerInternalList{}) +} diff --git a/pkg/api/v1alpha1/kepler_types.go b/pkg/api/v1alpha1/kepler_types.go index 1a927548..41ec8e3a 100644 --- a/pkg/api/v1alpha1/kepler_types.go +++ b/pkg/api/v1alpha1/kepler_types.go @@ -21,77 +21,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// NOTE: json tags are required. Any new fields you add must have json tags for -// the fields to be serialized. - -type Cgroupv2 string - -type RatioMetrics struct { - Global string `json:"global,omitempty"` - Core string `json:"core,omitempty"` - Uncore string `json:"uncore,omitempty"` - Dram string `json:"dram,omitempty"` -} - -type Sources struct { - Cgroupv2 Cgroupv2 `json:"cgroupv2,omitempty"` - Bpf string `json:"bpf,omitempty"` - Counters string `json:"counters,omitempty"` - Kubelet string `json:"kubelet,omitempty"` -} - -type HTTPHeader struct { - Key string `json:"headerKey,omitempty"` - Value string `json:"headerValue,omitempty"` -} - -type ModelServerTrainerSpec struct { - // TODO: consider namespacing all Prometheus related fields - - // +kubebuilder:default=20 - PromQueryInterval int `json:"promQueryInterval,omitempty"` - - // +kubebuilder:default=3 - PromQueryStep int `json:"promQueryStep,omitempty"` - - PromHeaders []HTTPHeader `json:"promHeaders,omitempty"` - - // +kubebuilder:default=true - PromSSLDisable bool `json:"promSSLDisable,omitempty"` - - // +kubebuilder:default="" - InitialModelsEndpoint string `json:"initialModelsEndpoint,omitempty"` - - // +kubebuilder:default="" - InitialModelNames string `json:"initialModelNames,omitempty"` -} - -type ModelServerSpec struct { - - // +kubebuilder:default="" - URL string `json:"url,omitempty"` - - // +kubebuilder:default=8100 - Port int `json:"port,omitempty"` - - // +kubebuilder:default="" - Path string `json:"path,omitempty"` - - // +kubebuilder:default="" - RequiredPath string `json:"requiredPath,omitempty"` - - // +kubebuilder:default="" - PromServer string `json:"promServer,omitempty"` - - Trainer *ModelServerTrainerSpec `json:"trainer,omitempty"` -} - -type EstimatorSpec struct { - ModelName string `json:"modelName,omitempty"` - FilterConditions string `json:"filterConditions,omitempty"` - InitUrl string `json:"initUrl,omitempty"` -} - type ExporterDeploymentSpec struct { // +kubebuilder:default=9103 // +kubebuilder:validation:Maximum=65535 @@ -192,8 +121,8 @@ type Condition struct { Message string `json:"message"` } -// KeplerStatus defines the observed state of Kepler -type KeplerStatus struct { +// ExporterStatus defines the observed state of Kepler Exporter +type ExporterStatus struct { // The number of nodes that are running at least 1 kepler pod and are 
// supposed to run the kepler pod. CurrentNumberScheduled int32 `json:"currentNumberScheduled"` @@ -224,8 +153,7 @@ type KeplerStatus struct { // +optional NumberUnavailable int32 `json:"numberUnavailable,omitempty"` - // conditions represent the latest available observations of the kepler-system - + // conditions represent the latest available observations of the kepler-exporter // +operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:com.tectonic.ui:conditions" // +listType=atomic Conditions []Condition `json:"conditions"` @@ -236,11 +164,11 @@ type KeplerStatus struct { //+kubebuilder:subresource:status // +kubebuilder:printcolumn:name="Port",type=integer,JSONPath=`.spec.exporter.deployment.port` -// +kubebuilder:printcolumn:name="Desired",type=integer,JSONPath=`.status.desiredNumberScheduled` -// +kubebuilder:printcolumn:name="Current",type=integer,JSONPath=`.status.currentNumberScheduled` -// +kubebuilder:printcolumn:name="Ready",type=integer,JSONPath=`.status.numberReady` -// +kubebuilder:printcolumn:name="Up-to-date",type=integer,JSONPath=`.status.updatedNumberScheduled` -// +kubebuilder:printcolumn:name="Available",type=integer,JSONPath=`.status.numberAvailable` +// +kubebuilder:printcolumn:name="Desired",type=integer,JSONPath=`.status.exporter.desiredNumberScheduled` +// +kubebuilder:printcolumn:name="Current",type=integer,JSONPath=`.status.exporter.currentNumberScheduled` +// +kubebuilder:printcolumn:name="Ready",type=integer,JSONPath=`.status.exporter.numberReady` +// +kubebuilder:printcolumn:name="Up-to-date",type=integer,JSONPath=`.status.exporter.updatedNumberScheduled` +// +kubebuilder:printcolumn:name="Available",type=integer,JSONPath=`.status.exporter.numberAvailable` // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:printcolumn:name="Node-Selector",type=string,JSONPath=`.spec.exporter.deployment.nodeSelector`,priority=10 // +kubebuilder:printcolumn:name="Tolerations",type=string,JSONPath=`.spec.exporter.deployment.tolerations`,priority=10 @@ -254,6 +182,11 @@ type Kepler struct { Status KeplerStatus `json:"status,omitempty"` } +// KeplerStatus defines the observed state of Kepler +type KeplerStatus struct { + Exporter ExporterStatus `json:"exporter,omitempty"` +} + //+kubebuilder:object:root=true // KeplerList contains a list of Kepler diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index fd0aefc1..8fe6e8de 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -43,16 +43,16 @@ func (in *Condition) DeepCopy() *Condition { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EstimatorSpec) DeepCopyInto(out *EstimatorSpec) { +func (in *DashboardSpec) DeepCopyInto(out *DashboardSpec) { *out = *in } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EstimatorSpec. -func (in *EstimatorSpec) DeepCopy() *EstimatorSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DashboardSpec. +func (in *DashboardSpec) DeepCopy() *DashboardSpec { if in == nil { return nil } - out := new(EstimatorSpec) + out := new(DashboardSpec) in.DeepCopyInto(out) return out } @@ -103,16 +103,55 @@ func (in *ExporterSpec) DeepCopy() *ExporterSpec { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *HTTPHeader) DeepCopyInto(out *HTTPHeader) { +func (in *ExporterStatus) DeepCopyInto(out *ExporterStatus) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HTTPHeader. -func (in *HTTPHeader) DeepCopy() *HTTPHeader { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExporterStatus. +func (in *ExporterStatus) DeepCopy() *ExporterStatus { if in == nil { return nil } - out := new(HTTPHeader) + out := new(ExporterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InternalExporterDeploymentSpec) DeepCopyInto(out *InternalExporterDeploymentSpec) { + *out = *in + in.ExporterDeploymentSpec.DeepCopyInto(&out.ExporterDeploymentSpec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InternalExporterDeploymentSpec. +func (in *InternalExporterDeploymentSpec) DeepCopy() *InternalExporterDeploymentSpec { + if in == nil { + return nil + } + out := new(InternalExporterDeploymentSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InternalExporterSpec) DeepCopyInto(out *InternalExporterSpec) { + *out = *in + in.Deployment.DeepCopyInto(&out.Deployment) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InternalExporterSpec. +func (in *InternalExporterSpec) DeepCopy() *InternalExporterSpec { + if in == nil { + return nil + } + out := new(InternalExporterSpec) in.DeepCopyInto(out) return out } @@ -145,31 +184,58 @@ func (in *Kepler) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *KeplerList) DeepCopyInto(out *KeplerList) { +func (in *KeplerInternal) DeepCopyInto(out *KeplerInternal) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerInternal. +func (in *KeplerInternal) DeepCopy() *KeplerInternal { + if in == nil { + return nil + } + out := new(KeplerInternal) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *KeplerInternal) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KeplerInternalList) DeepCopyInto(out *KeplerInternalList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items - *out = make([]Kepler, len(*in)) + *out = make([]KeplerInternal, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerList. 
-func (in *KeplerList) DeepCopy() *KeplerList { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerInternalList. +func (in *KeplerInternalList) DeepCopy() *KeplerInternalList { if in == nil { return nil } - out := new(KeplerList) + out := new(KeplerInternalList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *KeplerList) DeepCopyObject() runtime.Object { +func (in *KeplerInternalList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } @@ -177,109 +243,114 @@ func (in *KeplerList) DeepCopyObject() runtime.Object { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *KeplerSpec) DeepCopyInto(out *KeplerSpec) { +func (in *KeplerInternalSpec) DeepCopyInto(out *KeplerInternalSpec) { *out = *in in.Exporter.DeepCopyInto(&out.Exporter) + out.OpenShift = in.OpenShift } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerSpec. -func (in *KeplerSpec) DeepCopy() *KeplerSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerInternalSpec. +func (in *KeplerInternalSpec) DeepCopy() *KeplerInternalSpec { if in == nil { return nil } - out := new(KeplerSpec) + out := new(KeplerInternalSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *KeplerStatus) DeepCopyInto(out *KeplerStatus) { +func (in *KeplerInternalStatus) DeepCopyInto(out *KeplerInternalStatus) { *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } + in.Exporter.DeepCopyInto(&out.Exporter) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerStatus. -func (in *KeplerStatus) DeepCopy() *KeplerStatus { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerInternalStatus. +func (in *KeplerInternalStatus) DeepCopy() *KeplerInternalStatus { if in == nil { return nil } - out := new(KeplerStatus) + out := new(KeplerInternalStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ModelServerSpec) DeepCopyInto(out *ModelServerSpec) { +func (in *KeplerList) DeepCopyInto(out *KeplerList) { *out = *in - if in.Trainer != nil { - in, out := &in.Trainer, &out.Trainer - *out = new(ModelServerTrainerSpec) - (*in).DeepCopyInto(*out) + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Kepler, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelServerSpec. -func (in *ModelServerSpec) DeepCopy() *ModelServerSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerList. +func (in *KeplerList) DeepCopy() *KeplerList { if in == nil { return nil } - out := new(ModelServerSpec) + out := new(KeplerList) in.DeepCopyInto(out) return out } +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *KeplerList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ModelServerTrainerSpec) DeepCopyInto(out *ModelServerTrainerSpec) { +func (in *KeplerSpec) DeepCopyInto(out *KeplerSpec) { *out = *in - if in.PromHeaders != nil { - in, out := &in.PromHeaders, &out.PromHeaders - *out = make([]HTTPHeader, len(*in)) - copy(*out, *in) - } + in.Exporter.DeepCopyInto(&out.Exporter) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelServerTrainerSpec. -func (in *ModelServerTrainerSpec) DeepCopy() *ModelServerTrainerSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerSpec. +func (in *KeplerSpec) DeepCopy() *KeplerSpec { if in == nil { return nil } - out := new(ModelServerTrainerSpec) + out := new(KeplerSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *RatioMetrics) DeepCopyInto(out *RatioMetrics) { +func (in *KeplerStatus) DeepCopyInto(out *KeplerStatus) { *out = *in + in.Exporter.DeepCopyInto(&out.Exporter) } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RatioMetrics. -func (in *RatioMetrics) DeepCopy() *RatioMetrics { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerStatus. +func (in *KeplerStatus) DeepCopy() *KeplerStatus { if in == nil { return nil } - out := new(RatioMetrics) + out := new(KeplerStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *Sources) DeepCopyInto(out *Sources) { +func (in *OpenShiftSpec) DeepCopyInto(out *OpenShiftSpec) { *out = *in + out.Dashboard = in.Dashboard } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Sources. -func (in *Sources) DeepCopy() *Sources { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OpenShiftSpec. 
+func (in *OpenShiftSpec) DeepCopy() *OpenShiftSpec { if in == nil { return nil } - out := new(Sources) + out := new(OpenShiftSpec) in.DeepCopyInto(out) return out } diff --git a/pkg/components/components.go b/pkg/components/components.go index dee9b851..b4a34ffd 100644 --- a/pkg/components/components.go +++ b/pkg/components/components.go @@ -32,22 +32,17 @@ const ( var ( CommonLabels = k8s.StringMap{ "app.kubernetes.io/managed-by": "kepler-operator", - "app.kubernetes.io/part-of": "kepler", } ) -const ( - Namespace = "openshift-kepler-operator" -) - -func NewKeplerNamespace() *corev1.Namespace { +func NewNamespace(ns string) *corev1.Namespace { return &corev1.Namespace{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "Namespace", }, ObjectMeta: metav1.ObjectMeta{ - Name: Namespace, + Name: ns, Labels: CommonLabels.Merge(k8s.StringMap{ // NOTE: Fixes the following error On Openshift 4.14 // Warning FailedCreate daemonset-controller diff --git a/pkg/components/exporter/assets/dashboards/power-monitoring-by-ns.json b/pkg/components/exporter/assets/dashboards/power-monitoring-by-ns.json index ad98de53..cdde3f5c 100644 --- a/pkg/components/exporter/assets/dashboards/power-monitoring-by-ns.json +++ b/pkg/components/exporter/assets/dashboards/power-monitoring-by-ns.json @@ -84,7 +84,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kepler:container_package_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", + "expr": "sum(kepler:kepler:container_package_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", "hide": false, "interval": "", "legendFormat": "PKG", @@ -97,7 +97,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kepler:container_dram_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", + "expr": "sum(kepler:kepler:container_dram_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", "hide": false, "interval": "", "legendFormat": "DRAM", @@ -110,7 +110,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kepler:container_other_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", + "expr": "sum(kepler:kepler:container_other_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", "hide": false, "legendFormat": "OTHER", "range": true, @@ -122,7 +122,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(kepler:container_gpu_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", + "expr": "sum(kepler:kepler:container_gpu_watts:1m:by_ns_pod{container_namespace=~\"$namespace\"})", "hide": false, "legendFormat": "GPU", "range": true, @@ -216,7 +216,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "kepler:container_package_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", + "expr": "kepler:kepler:container_package_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", "hide": false, "interval": "", "legendFormat": "PKG (CORE + UNCORE)", @@ -229,7 +229,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "kepler:container_dram_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", + "expr": "kepler:kepler:container_dram_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", "hide": false, "interval": "", "legendFormat": "DRAM", @@ -242,7 +242,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": 
"kepler:container_other_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", + "expr": "kepler:kepler:container_other_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", "hide": false, "legendFormat": "OTHER", "range": true, @@ -254,7 +254,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "kepler:container_gpu_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", + "expr": "kepler:kepler:container_gpu_joules_total:consumed:1h:by_ns{container_namespace=~\"$namespace\"} * 0.000000277777777777778", "hide": false, "legendFormat": " GPU", "range": true, @@ -345,7 +345,7 @@ { "datasource": "prometheus", "editorMode": "code", - "expr": "kepler:container_package_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"}", + "expr": "kepler:kepler:container_package_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"}", "hide": false, "interval": "", "legendFormat": "{{pod_name}}", @@ -437,7 +437,7 @@ { "datasource": "prometheus", "editorMode": "code", - "expr": "kepler:container_dram_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"}", + "expr": "kepler:kepler:container_dram_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"}", "hide": false, "interval": "", "legendFormat": "{{pod_name}}", @@ -529,7 +529,7 @@ { "datasource": "prometheus", "editorMode": "code", - "expr": "sum by (pod_name) (kepler:container_gpu_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"})", + "expr": "sum by (pod_name) (kepler:kepler:container_gpu_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"})", "hide": false, "interval": "", "legendFormat": "{{pod_name}}", @@ -615,7 +615,7 @@ { "datasource": "prometheus", "editorMode": "code", - "expr": "sum by (pod_name) (kepler:other_joules_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"})", + "expr": "sum by (pod_name) (kepler:kepler:other_joules_watts:1m:by_ns_pod{container_namespace=~\"$namespace\", pod_name=~\"$pod\"})", "hide": false, "interval": "", "legendFormat": "{{pod_name}} }}", @@ -657,7 +657,9 @@ "refresh": "", "schemaVersion": 36, "style": "dark", - "tags": ["kepler-mixin"], + "tags": [ + "kepler-mixin" + ], "templating": { "list": [ { diff --git a/pkg/components/exporter/assets/dashboards/power-monitoring-overview.json b/pkg/components/exporter/assets/dashboards/power-monitoring-overview.json index 393b3069..18e37167 100644 --- a/pkg/components/exporter/assets/dashboards/power-monitoring-overview.json +++ b/pkg/components/exporter/assets/dashboards/power-monitoring-overview.json @@ -120,7 +120,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "count by (cpu_architecture)(kepler_node_info) ", + "expr": "count by (cpu_architecture)(kepler_node_info{container=\"kepler\"}) ", "format": "table", "instant": true, "legendFormat": "", @@ -186,7 +186,7 @@ "type": "prometheus", "uid": "To6-2So4k" }, - "expr": "kepler:container_joules_total:consumed:24h:all * 0.000000277777777777778", + "expr": "kepler:kepler:container_joules_total:consumed:24h:all * 0.000000277777777777778", "refId": "A" } ], @@ -291,7 +291,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(10, kepler:container_joules_total:consumed:24h:by_ns) * 0.000000277777777777778", + "expr": "topk(10, kepler:kepler:container_joules_total:consumed:24h:by_ns) * 0.000000277777777777778", "format": 
"table", "interval": "", "legendFormat": "{{container_namespace}}", diff --git a/pkg/components/exporter/exporter.go b/pkg/components/exporter/exporter.go index 3366c7a4..ae88961f 100644 --- a/pkg/components/exporter/exporter.go +++ b/pkg/components/exporter/exporter.go @@ -18,8 +18,9 @@ package exporter import ( _ "embed" + "fmt" + "regexp" "strconv" - "strings" "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/components" @@ -37,52 +38,14 @@ import ( ) const ( - prefix = "kepler-exporter-" - - SCCName = prefix + "scc" - - ServiceAccountName = prefix + "sa" - FQServiceAccountName = "system:serviceaccount:" + components.Namespace + ":" + ServiceAccountName - - ClusterRoleName = prefix + "cr" - ClusterRoleBindingName = prefix + "crb" - - ConfigmapName = prefix + "cm" - DaemonSetName = prefix + "ds" - - ServiceName = prefix + "svc" - ServicePortName = "http" - ServiceMonitorName = prefix + "smon" + ServicePortName = "http" overviewDashboardName = "power-monitoring-overview" nsInfoDashboardName = "power-monitoring-by-ns" DashboardNs = "openshift-config-managed" - - PrometheusRuleName = prefix + "prom-rules" - - KeplerBpfAttachMethodAnnotation = "kepler.sustainable.computing.io/bpf-attach-method" - KeplerBpfAttachMethodBCC = "bcc" - KeplerBpfAttachMethodLibbpf = "libbpf" -) - -// Config that will be set from outside -var ( - Config = struct { - Image string - ImageLibbpf string - }{} ) var ( - labels = components.CommonLabels.Merge(k8s.StringMap{ - "app.kubernetes.io/component": "exporter", - "sustainable-computing.io/app": "kepler", - }) - - podSelector = labels.Merge(k8s.StringMap{ - "app.kubernetes.io/name": "kepler-exporter", - }) - linuxNodeSelector = k8s.StringMap{ "kubernetes.io/os": "linux", } @@ -94,28 +57,33 @@ var ( nsInfoDashboardJson string ) -func NewDaemonSet(detail components.Detail, k *v1alpha1.Kepler) *appsv1.DaemonSet { +// TODO: +func NewDaemonSet(detail components.Detail, k *v1alpha1.KeplerInternal) *appsv1.DaemonSet { if detail == components.Metadata { return &appsv1.DaemonSet{ - TypeMeta: metav1.TypeMeta{APIVersion: appsv1.SchemeGroupVersion.String(), Kind: "DaemonSet"}, + TypeMeta: metav1.TypeMeta{ + APIVersion: appsv1.SchemeGroupVersion.String(), + Kind: "DaemonSet", + }, ObjectMeta: metav1.ObjectMeta{ - Name: DaemonSetName, - Namespace: components.Namespace, - Labels: labels, + Name: k.DaemonsetName(), + Namespace: k.Namespace(), + Labels: labels(k), }, } } - deployment := k.Spec.Exporter.Deployment + deployment := k.Spec.Exporter.Deployment.ExporterDeploymentSpec + image := k.Spec.Exporter.Deployment.Image nodeSelector := deployment.NodeSelector tolerations := deployment.Tolerations + port := deployment.Port + // NOTE: since 2 or more KeplerInternals can be deployed to the same namespace, + // we need to make sure that the pod selector of each of the DaemonSet + // create of each kepler is unique. 
Thus the daemonset name is added as + // label to the pod - bindAddress := "0.0.0.0:" + strconv.Itoa(int(deployment.Port)) - - keplerImage := Config.Image - if IsLibbpfAttachType(k) { - keplerImage = Config.ImageLibbpf - } + bindAddress := "0.0.0.0:" + strconv.Itoa(int(port)) return &appsv1.DaemonSet{ TypeMeta: metav1.TypeMeta{ @@ -123,28 +91,28 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.Kepler) *appsv1.DaemonSe Kind: "DaemonSet", }, ObjectMeta: metav1.ObjectMeta{ - Name: DaemonSetName, - Namespace: components.Namespace, - Labels: labels, + Name: k.Name, + Namespace: k.Namespace(), + Labels: labels(k), }, Spec: appsv1.DaemonSetSpec{ - Selector: &metav1.LabelSelector{MatchLabels: podSelector}, + Selector: &metav1.LabelSelector{MatchLabels: podSelector(k)}, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Name: DaemonSetName, - Namespace: components.Namespace, - Labels: podSelector, + Name: k.DaemonsetName(), + Namespace: k.Namespace(), + Labels: podSelector(k), }, Spec: corev1.PodSpec{ HostPID: true, NodeSelector: linuxNodeSelector.Merge(nodeSelector), - ServiceAccountName: ServiceAccountName, + ServiceAccountName: k.Name, DNSPolicy: corev1.DNSPolicy(corev1.DNSClusterFirstWithHostNet), Tolerations: tolerations, Containers: []corev1.Container{{ - Name: "kepler-exporter", + Name: k.DaemonsetName(), SecurityContext: &corev1.SecurityContext{Privileged: pointer.Bool(true)}, - Image: keplerImage, + Image: image, Command: []string{ "/usr/bin/kepler", "-address", bindAddress, @@ -155,14 +123,14 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.Kepler) *appsv1.DaemonSe "-redfish-cred-file-path=/etc/redfish/redfish.csv", }, Ports: []corev1.ContainerPort{{ - ContainerPort: int32(deployment.Port), + ContainerPort: int32(port), Name: "http", }}, LivenessProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ HTTPGet: &corev1.HTTPGetAction{ Path: "/healthz", - Port: intstr.IntOrString{Type: intstr.Int, IntVal: deployment.Port}, + Port: intstr.IntOrString{Type: intstr.Int, IntVal: port}, Scheme: "HTTP", }, }, @@ -174,8 +142,8 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.Kepler) *appsv1.DaemonSe Env: []corev1.EnvVar{ {Name: "NODE_IP", ValueFrom: k8s.EnvFromField("status.hostIP")}, {Name: "NODE_NAME", ValueFrom: k8s.EnvFromField("spec.nodeName")}, - {Name: "KEPLER_LOG_LEVEL", ValueFrom: k8s.EnvFromConfigMap("KEPLER_LOG_LEVEL", ConfigmapName)}, - {Name: "ENABLE_GPU", ValueFrom: k8s.EnvFromConfigMap("ENABLE_GPU", ConfigmapName)}}, + {Name: "KEPLER_LOG_LEVEL", ValueFrom: k8s.EnvFromConfigMap("KEPLER_LOG_LEVEL", k.Name)}, + {Name: "ENABLE_GPU", ValueFrom: k8s.EnvFromConfigMap("ENABLE_GPU", k.Name)}}, VolumeMounts: []corev1.VolumeMount{ {Name: "lib-modules", MountPath: "/lib/modules", ReadOnly: true}, {Name: "tracing", MountPath: "/sys", ReadOnly: true}, @@ -191,20 +159,19 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.Kepler) *appsv1.DaemonSe k8s.VolumeFromHost("proc", "/proc"), k8s.VolumeFromHost("kernel-src", "/usr/src/kernels"), k8s.VolumeFromHost("kernel-debug", "/sys/kernel/debug"), - k8s.VolumeFromConfigMap("cfm", ConfigmapName), + k8s.VolumeFromConfigMap("cfm", k.Name), }, // Volumes }, // PodSpec }, // PodTemplateSpec }, // Spec } - } func openshiftDashboardObjectMeta(name string) metav1.ObjectMeta { return metav1.ObjectMeta{ Name: name, Namespace: DashboardNs, - Labels: labels.Merge(k8s.StringMap{ + Labels: components.CommonLabels.Merge(k8s.StringMap{ "console.openshift.io/dashboard": "true", }), Annotations: k8s.StringMap{ @@ -264,7 
+231,7 @@ func NewNamespaceInfoDashboard(d components.Detail) *corev1.ConfigMap { } } -func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { +func NewConfigMap(d components.Detail, k *v1alpha1.KeplerInternal) *corev1.ConfigMap { if d == components.Metadata { return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -272,14 +239,14 @@ func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: ConfigmapName, - Namespace: components.Namespace, - Labels: labels, + Name: k.Name, + Namespace: k.Namespace(), + Labels: labels(k).ToMap(), }, } } - deployment := k.Spec.Exporter.Deployment + deployment := k.Spec.Exporter.Deployment.ExporterDeploymentSpec bindAddress := "0.0.0.0:" + strconv.Itoa(int(deployment.Port)) return &corev1.ConfigMap{ @@ -288,12 +255,12 @@ func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: ConfigmapName, - Namespace: components.Namespace, - Labels: labels, + Name: k.Name, + Namespace: k.Namespace(), + Labels: labels(k).ToMap(), }, Data: map[string]string{ - "KEPLER_NAMESPACE": components.Namespace, + "KEPLER_NAMESPACE": k.Namespace(), "KEPLER_LOG_LEVEL": "1", "METRIC_PATH": "/metrics", "BIND_ADDRESS": bindAddress, @@ -314,7 +281,7 @@ func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { } } -func NewClusterRole(c components.Detail) *rbacv1.ClusterRole { +func NewClusterRole(c components.Detail, k *v1alpha1.KeplerInternal) *rbacv1.ClusterRole { if c == components.Metadata { return &rbacv1.ClusterRole{ TypeMeta: metav1.TypeMeta{ @@ -322,8 +289,8 @@ func NewClusterRole(c components.Detail) *rbacv1.ClusterRole { Kind: "ClusterRole", }, ObjectMeta: metav1.ObjectMeta{ - Name: ClusterRoleName, - Labels: labels, + Name: k.Name, + Labels: labels(k), }, } } @@ -334,8 +301,8 @@ func NewClusterRole(c components.Detail) *rbacv1.ClusterRole { Kind: "ClusterRole", }, ObjectMeta: metav1.ObjectMeta{ - Name: ClusterRoleName, - Labels: labels, + Name: k.Name, + Labels: labels(k), }, Rules: []rbacv1.PolicyRule{{ APIGroups: []string{""}, @@ -345,7 +312,7 @@ func NewClusterRole(c components.Detail) *rbacv1.ClusterRole { } } -func NewClusterRoleBinding(c components.Detail) *rbacv1.ClusterRoleBinding { +func NewClusterRoleBinding(c components.Detail, k *v1alpha1.KeplerInternal) *rbacv1.ClusterRoleBinding { if c == components.Metadata { return &rbacv1.ClusterRoleBinding{ TypeMeta: metav1.TypeMeta{ @@ -353,8 +320,8 @@ func NewClusterRoleBinding(c components.Detail) *rbacv1.ClusterRoleBinding { Kind: "ClusterRoleBinding", }, ObjectMeta: metav1.ObjectMeta{ - Name: ClusterRoleBindingName, - Labels: labels, + Name: k.Name, + Labels: labels(k), }, } } @@ -365,23 +332,23 @@ func NewClusterRoleBinding(c components.Detail) *rbacv1.ClusterRoleBinding { Kind: "ClusterRoleBinding", }, ObjectMeta: metav1.ObjectMeta{ - Name: ClusterRoleBindingName, - Labels: labels, + Name: k.Name, + Labels: labels(k), }, RoleRef: rbacv1.RoleRef{ APIGroup: "rbac.authorization.k8s.io", Kind: "ClusterRole", - Name: ClusterRoleName, + Name: k.Name, }, Subjects: []rbacv1.Subject{{ Kind: "ServiceAccount", - Name: ServiceAccountName, - Namespace: components.Namespace, + Name: k.Name, + Namespace: k.Namespace(), }}, } } -func NewSCC(d components.Detail, k *v1alpha1.Kepler) *secv1.SecurityContextConstraints { +func NewSCC(d components.Detail, ki *v1alpha1.KeplerInternal) *secv1.SecurityContextConstraints { if d == 
components.Metadata { return &secv1.SecurityContextConstraints{ TypeMeta: metav1.TypeMeta{ @@ -390,8 +357,8 @@ func NewSCC(d components.Detail, k *v1alpha1.Kepler) *secv1.SecurityContextConst }, ObjectMeta: metav1.ObjectMeta{ - Name: SCCName, - Labels: labels, + Name: ki.Name, + Labels: labels(ki), }, } } @@ -403,8 +370,8 @@ func NewSCC(d components.Detail, k *v1alpha1.Kepler) *secv1.SecurityContextConst }, ObjectMeta: metav1.ObjectMeta{ - Name: SCCName, - Labels: labels, + Name: ki.Name, + Labels: labels(ki), }, AllowPrivilegedContainer: true, @@ -425,8 +392,7 @@ func NewSCC(d components.Detail, k *v1alpha1.Kepler) *secv1.SecurityContextConst SELinuxContext: secv1.SELinuxContextStrategyOptions{ Type: secv1.SELinuxStrategyRunAsAny, }, - //TODO: decide if "kepler" is really needed? - Users: []string{"kepler", FQServiceAccountName}, + Users: []string{ki.FQServiceAccountName()}, Volumes: []secv1.FSType{ secv1.FSType("configMap"), secv1.FSType("projected"), @@ -435,22 +401,22 @@ func NewSCC(d components.Detail, k *v1alpha1.Kepler) *secv1.SecurityContextConst } } -func NewServiceAccount() *corev1.ServiceAccount { +func NewServiceAccount(ki *v1alpha1.KeplerInternal) *corev1.ServiceAccount { return &corev1.ServiceAccount{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ServiceAccount", }, ObjectMeta: metav1.ObjectMeta{ - Name: ServiceAccountName, - Namespace: components.Namespace, - Labels: labels, + Name: ki.Name, + Namespace: ki.Namespace(), + Labels: labels(ki).ToMap(), }, } } -func NewService(k *v1alpha1.Kepler) *corev1.Service { - deployment := k.Spec.Exporter.Deployment +func NewService(k *v1alpha1.KeplerInternal) *corev1.Service { + deployment := k.Spec.Exporter.Deployment.ExporterDeploymentSpec return &corev1.Service{ TypeMeta: metav1.TypeMeta{ @@ -458,14 +424,14 @@ func NewService(k *v1alpha1.Kepler) *corev1.Service { Kind: "Service", }, ObjectMeta: metav1.ObjectMeta{ - Name: ServiceName, - Namespace: components.Namespace, - Labels: labels, + Name: k.Name, + Namespace: k.Namespace(), + Labels: labels(k).ToMap(), }, Spec: corev1.ServiceSpec{ ClusterIP: "None", - Selector: podSelector, + Selector: podSelector(k), Ports: []corev1.ServicePort{{ Name: ServicePortName, Port: int32(deployment.Port), @@ -478,7 +444,7 @@ func NewService(k *v1alpha1.Kepler) *corev1.Service { } } -func NewServiceMonitor() *monv1.ServiceMonitor { +func NewServiceMonitor(k *v1alpha1.KeplerInternal) *monv1.ServiceMonitor { relabelings := []*monv1.RelabelConfig{{ Action: "replace", Regex: "(.*)", @@ -495,9 +461,9 @@ func NewServiceMonitor() *monv1.ServiceMonitor { Kind: "ServiceMonitor", }, ObjectMeta: metav1.ObjectMeta{ - Name: ServiceMonitorName, - Namespace: components.Namespace, - Labels: labels, + Name: k.Name, + Namespace: k.Namespace(), + Labels: labels(k).ToMap(), }, Spec: monv1.ServiceMonitorSpec{ Endpoints: []monv1.Endpoint{{ @@ -508,14 +474,31 @@ func NewServiceMonitor() *monv1.ServiceMonitor { }}, JobLabel: "app.kubernetes.io/name", Selector: metav1.LabelSelector{ - MatchLabels: labels, + MatchLabels: labels(k), }, }, } } -func NewPrometheusRule() *monv1.PrometheusRule { +var ( + promRuleInvalidChars = regexp.MustCompile(`[^a-zA-Z0-9]`) +) + +func keplerRulePrefix(name string) string { + ruleName := promRuleInvalidChars.ReplaceAllString(name, "_") + return fmt.Sprintf("kepler:%s", ruleName) +} + +func NewPrometheusRule(k *v1alpha1.KeplerInternal) *monv1.PrometheusRule { interval := monv1.Duration("15s") + ns := k.Namespace() + // + // NOTE: recording rules have a 
kepler-internal name prefixed as
+	// kepler:<internal-name> so that there is a unique rule created per
+	// object, and dashboards can rely on the kepler:kepler: prefix for the
+	// `kepler` object.
+
+	prefix := keplerRulePrefix(k.Name)

 	return &monv1.PrometheusRule{
 		TypeMeta: metav1.TypeMeta{
@@ -523,73 +506,73 @@
 			Kind:       "PrometheusRule",
 		},
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      PrometheusRuleName,
-			Namespace: components.Namespace,
-			Labels:    labels,
+			Name:      k.Name,
+			Namespace: ns,
+			Labels:    labels(k).ToMap(),
 		},
 		Spec: monv1.PrometheusRuleSpec{
 			Groups: []monv1.RuleGroup{{
 				Name:     "kepler.rules",
 				Interval: &interval,
 				Rules: []monv1.Rule{
-					record("kepler:container_joules_total:consumed:24h:all",
-						`sum(
-							increase(kepler_container_joules_total[24h:1m])
-						)`,
+					record(prefix, "container_joules_total:consumed:24h:all",
+						fmt.Sprintf(`sum(
+							increase(kepler_container_joules_total{namespace=%q}[24h:1m])
+						)`, ns),
					),
-					record("kepler:container_joules_total:consumed:24h:by_ns",
-						`sum by (container_namespace) (
-							increase(kepler_container_joules_total[24h:1m])
-						)`,
+					record(prefix, "container_joules_total:consumed:24h:by_ns",
+						fmt.Sprintf(`sum by (container_namespace) (
+							increase(kepler_container_joules_total{namespace=%q}[24h:1m])
+						)`, ns),
					),
-					record("kepler:container_gpu_joules_total:consumed:1h:by_ns",
-						`sum by (container_namespace) (
-							increase(kepler_container_gpu_joules_total[1h:15s])
-						)`,
+					record(prefix, "container_gpu_joules_total:consumed:1h:by_ns",
+						fmt.Sprintf(`sum by (container_namespace) (
+							increase(kepler_container_gpu_joules_total{namespace=%q}[1h:15s])
+						)`, ns),
					),
-					record("kepler:container_dram_joules_total:consumed:1h:by_ns",
-						`sum by (container_namespace) (
-							increase(kepler_container_dram_joules_total[1h:15s])
-						)`,
+					record(prefix, "container_dram_joules_total:consumed:1h:by_ns",
+						fmt.Sprintf(`sum by (container_namespace) (
+							increase(kepler_container_dram_joules_total{namespace=%q}[1h:15s])
+						)`, ns),
					),
-					record("kepler:container_package_joules_total:consumed:1h:by_ns",
-						`sum by (container_namespace) (
-							increase(kepler_container_package_joules_total[1h:15s])
-						)`,
+					record(prefix, "container_package_joules_total:consumed:1h:by_ns",
+						fmt.Sprintf(`sum by (container_namespace) (
+							increase(kepler_container_package_joules_total{namespace=%q}[1h:15s])
+						)`, ns),
					),
-					record("kepler:container_other_joules_total:consumed:1h:by_ns",
-						`sum by (container_namespace) (
-							increase(kepler_container_other_joules_total[1h:15s])
-						)`,
+					record(prefix, "container_other_joules_total:consumed:1h:by_ns",
+						fmt.Sprintf(`sum by (container_namespace) (
+							increase(kepler_container_other_joules_total{namespace=%q}[1h:15s])
+						)`, ns),
					),
 					// irate of joules = joules per second -> watts
-					record("kepler:container_gpu_watts:1m:by_ns_pod",
-						`sum by (container_namespace, pod_name) (
-							irate(kepler_container_gpu_joules_total[1m])
-						)`,
+					record(prefix, "container_gpu_watts:1m:by_ns_pod",
+						fmt.Sprintf(`sum by (container_namespace, pod_name) (
+							irate(kepler_container_gpu_joules_total{namespace=%q}[1m])
+						)`, ns),
					),
-					record("kepler:container_package_watts:1m:by_ns_pod",
-						`sum by (container_namespace, pod_name) (
-							irate(kepler_container_package_joules_total[1m])
-						)`,
+					record(prefix, "container_package_watts:1m:by_ns_pod",
+						fmt.Sprintf(`sum by (container_namespace, pod_name) (
+							irate(kepler_container_package_joules_total{namespace=%q}[1m])
+						)`, ns),
					),
-					record("kepler:container_other_watts:1m:by_ns_pod",
-						`sum by (container_namespace, pod_name) (
-
irate(kepler_container_other_joules_total[1m]) - )`, + record(prefix, "container_other_watts:1m:by_ns_pod", + fmt.Sprintf(`sum by (container_namespace, pod_name) ( + irate(kepler_container_other_joules_total{namespace=%q}[1m]) + )`, ns), ), - record("kepler:container_dram_watts:1m:by_ns_pod", - `sum by (container_namespace, pod_name) ( - irate(kepler_container_dram_joules_total[1m]) - )`, + record(prefix, "container_dram_watts:1m:by_ns_pod", + fmt.Sprintf(`sum by (container_namespace, pod_name) ( + irate(kepler_container_dram_joules_total{namespace=%q}[1m]) + )`, ns), ), }, }}, @@ -597,14 +580,23 @@ func NewPrometheusRule() *monv1.PrometheusRule { } } -func record(name, expr string) monv1.Rule { +func record(prefix, name, expr string) monv1.Rule { return monv1.Rule{ Expr: intstr.IntOrString{Type: intstr.String, StrVal: expr}, - Record: name, + Record: prefix + ":" + name, } } -func IsLibbpfAttachType(k *v1alpha1.Kepler) bool { - bpftype, ok := k.Annotations[KeplerBpfAttachMethodAnnotation] - return ok && strings.ToLower(bpftype) == KeplerBpfAttachMethodLibbpf +func podSelector(ki *v1alpha1.KeplerInternal) k8s.StringMap { + return labels(ki).Merge(k8s.StringMap{ + "app.kubernetes.io/name": "kepler-exporter", + }) +} + +func labels(ki *v1alpha1.KeplerInternal) k8s.StringMap { + return components.CommonLabels.Merge(k8s.StringMap{ + "app.kubernetes.io/component": "exporter", + "operator.sustainable-computing.io/internal": ki.Name, + "app.kubernetes.io/part-of": ki.Name, + }) } diff --git a/pkg/components/exporter/exporter_test.go b/pkg/components/exporter/exporter_test.go index a867179e..e008c3c9 100644 --- a/pkg/components/exporter/exporter_test.go +++ b/pkg/components/exporter/exporter_test.go @@ -14,19 +14,21 @@ import ( func TestNodeSelection(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec + spec v1alpha1.InternalExporterSpec selector map[string]string scenario string }{ { - spec: v1alpha1.ExporterSpec{}, + spec: v1alpha1.InternalExporterSpec{}, selector: map[string]string{"kubernetes.io/os": "linux"}, scenario: "default case", }, { - spec: v1alpha1.ExporterSpec{ - Deployment: v1alpha1.ExporterDeploymentSpec{ - NodeSelector: map[string]string{"k1": "v1"}, + spec: v1alpha1.InternalExporterSpec{ + Deployment: v1alpha1.InternalExporterDeploymentSpec{ + ExporterDeploymentSpec: v1alpha1.ExporterDeploymentSpec{ + NodeSelector: map[string]string{"k1": "v1"}, + }, }, }, selector: map[string]string{"k1": "v1", "kubernetes.io/os": "linux"}, @@ -38,8 +40,8 @@ func TestNodeSelection(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ Exporter: tc.spec, }, } @@ -52,20 +54,22 @@ func TestNodeSelection(t *testing.T) { func TestTolerations(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec + spec v1alpha1.InternalExporterSpec tolerations []corev1.Toleration scenario string }{{ - spec: v1alpha1.ExporterSpec{}, + spec: v1alpha1.InternalExporterSpec{}, // NOTE: default toleration { "operator": "Exists" } is set by k8s API server (CRD default) // see: Kepler_Reconciliation e2e test tolerations: nil, scenario: "default case", }, { - spec: v1alpha1.ExporterSpec{ - Deployment: v1alpha1.ExporterDeploymentSpec{ - Tolerations: []corev1.Toleration{{ - Effect: corev1.TaintEffectNoSchedule, Key: "key1"}}, + spec: v1alpha1.InternalExporterSpec{ + Deployment: v1alpha1.InternalExporterDeploymentSpec{ + ExporterDeploymentSpec: 
v1alpha1.ExporterDeploymentSpec{ + Tolerations: []corev1.Toleration{{ + Effect: corev1.TaintEffectNoSchedule, Key: "key1"}}, + }, }, }, tolerations: []corev1.Toleration{{ @@ -78,8 +82,8 @@ func TestTolerations(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ Exporter: tc.spec, }, } @@ -91,12 +95,12 @@ func TestTolerations(t *testing.T) { func TestHostPID(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec + spec v1alpha1.InternalExporterSpec hostPID bool scenario string }{ { - spec: v1alpha1.ExporterSpec{}, + spec: v1alpha1.InternalExporterSpec{}, hostPID: true, scenario: "default case", }, @@ -106,8 +110,8 @@ func TestHostPID(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ Exporter: tc.spec, }, } @@ -118,12 +122,12 @@ func TestHostPID(t *testing.T) { } func TestVolumeMounts(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec + spec v1alpha1.InternalExporterSpec volumeMounts []corev1.VolumeMount scenario string }{ { - spec: v1alpha1.ExporterSpec{}, + spec: v1alpha1.InternalExporterSpec{}, volumeMounts: []corev1.VolumeMount{ {Name: "lib-modules", MountPath: "/lib/modules", ReadOnly: true}, {Name: "tracing", MountPath: "/sys", ReadOnly: true}, @@ -140,8 +144,8 @@ func TestVolumeMounts(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ Exporter: tc.spec, }, } @@ -152,19 +156,19 @@ func TestVolumeMounts(t *testing.T) { } func TestVolumes(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec + spec v1alpha1.InternalExporterSpec volumes []corev1.Volume scenario string }{ { - spec: v1alpha1.ExporterSpec{}, + spec: v1alpha1.InternalExporterSpec{}, volumes: []corev1.Volume{ k8s.VolumeFromHost("lib-modules", "/lib/modules"), k8s.VolumeFromHost("tracing", "/sys"), k8s.VolumeFromHost("proc", "/proc"), k8s.VolumeFromHost("kernel-src", "/usr/src/kernels"), k8s.VolumeFromHost("kernel-debug", "/sys/kernel/debug"), - k8s.VolumeFromConfigMap("cfm", ConfigmapName), + k8s.VolumeFromConfigMap("cfm", "kepler-internal"), }, scenario: "default case", }, @@ -174,8 +178,11 @@ func TestVolumes(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ + k := v1alpha1.KeplerInternal{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kepler-internal", + }, + Spec: v1alpha1.KeplerInternalSpec{ Exporter: tc.spec, }, } @@ -187,12 +194,10 @@ func TestVolumes(t *testing.T) { func TestSCCAllows(t *testing.T) { tt := []struct { - spec v1alpha1.ExporterSpec sccAllows k8s.SCCAllows scenario string }{ { - spec: v1alpha1.ExporterSpec{}, sccAllows: k8s.SCCAllows{ AllowPrivilegedContainer: true, AllowHostDirVolumePlugin: true, @@ -209,9 +214,9 @@ func TestSCCAllows(t *testing.T) { tc := tc t.Run(tc.scenario, func(t *testing.T) { t.Parallel() - k := v1alpha1.Kepler{ - Spec: v1alpha1.KeplerSpec{ - Exporter: tc.spec, + k := v1alpha1.KeplerInternal{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kepler-internal", }, } actual := k8s.AllowsFromSCC(NewSCC(components.Full, &k)) @@ -220,54 +225,20 @@ func TestSCCAllows(t *testing.T) { } } -func TestBpfAttachMethod(t *testing.T) { - +func TestRecordingRuleName(t 
*testing.T) {
 	tt := []struct {
-		annotations map[string]string
-		scenario    string
-		IsLibbpf    bool
+		keplerName string
+		recRule    string
 	}{
-		{
-			annotations: map[string]string{},
-			IsLibbpf:    false,
-			scenario:    "no annotation",
-		},
-		{
-			annotations: map[string]string{
-				KeplerBpfAttachMethodAnnotation: "junk",
-			},
-			IsLibbpf: false,
-			scenario: "annotation present but not libbpf",
-		},
-		{
-			annotations: map[string]string{
-				KeplerBpfAttachMethodAnnotation: "bcc",
-			},
-			IsLibbpf: false,
-			scenario: "annotation present with bcc",
-		},
-		{
-			annotations: map[string]string{
-				KeplerBpfAttachMethodAnnotation: "libbpf",
-			},
-			IsLibbpf: true,
-			scenario: "annotation present with libbpf",
-		},
+		{"kepler", "kepler:kepler"},
+		{"kepler-internal", "kepler:kepler_internal"},
+		{"kep-ler-inte.rnal", "kepler:kep_ler_inte_rnal"},
 	}

 	for _, tc := range tt {
 		tc := tc
-		t.Run(tc.scenario, func(t *testing.T) {
-			t.Parallel()
-			k := v1alpha1.Kepler{
-				ObjectMeta: metav1.ObjectMeta{
-					Annotations: tc.annotations,
-				},
-				Spec: v1alpha1.KeplerSpec{
-					Exporter: v1alpha1.ExporterSpec{},
-				},
-			}
-			actual := IsLibbpfAttachType(&k)
-			assert.Equal(t, actual, tc.IsLibbpf)
+		t.Run(tc.keplerName, func(t *testing.T) {
+			actual := keplerRulePrefix(tc.keplerName)
+			assert.Equal(t, tc.recRule, actual)
 		})
 	}
 }
diff --git a/pkg/controllers/config.go b/pkg/controllers/config.go
new file mode 100644
index 00000000..6c4aa32c
--- /dev/null
+++ b/pkg/controllers/config.go
@@ -0,0 +1,32 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package controllers
+
+import "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
+
+// Config holds configuration shared across all controllers. This struct
+// should be initialized in main.
+var (
+	Config = struct {
+		Image       string
+		ImageLibbpf string
+		Cluster     k8s.Cluster
+	}{
+		Image:       "",
+		ImageLibbpf: "",
+		Cluster:     k8s.Kubernetes,
+	}
+)
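The controllers.Config block above is meant to be populated exactly once at startup. A minimal sketch of that wiring, assuming the operator's --kepler.image, --kepler.image.libbpf and --openshift flags; the actual cmd/manager code may register these flags and build the manager differently:

package main

import (
	"flag"

	"github.com/sustainable.computing.io/kepler-operator/pkg/controllers"
	"github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
)

func main() {
	var keplerImage, keplerImageLibbpf string
	var openshift bool

	flag.StringVar(&keplerImage, "kepler.image", "", "kepler image to deploy")
	flag.StringVar(&keplerImageLibbpf, "kepler.image.libbpf", "", "libbpf variant of the kepler image")
	flag.BoolVar(&openshift, "openshift", true, "true when the target cluster is OpenShift")
	flag.Parse()

	// initialize the shared controller configuration before any reconciler runs
	controllers.Config.Image = keplerImage
	controllers.Config.ImageLibbpf = keplerImageLibbpf
	if openshift {
		controllers.Config.Cluster = k8s.OpenShift
	}

	// ... construct the manager, register KeplerReconciler and
	// KeplerInternalReconciler, and call mgr.Start(...) as usual ...
}

With Config.Cluster set to k8s.OpenShift, the reconcilers below additionally own the OpenShift-only resources such as SecurityContextConstraints.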
diff --git a/pkg/controllers/kepler.go b/pkg/controllers/kepler.go
index f00bac23..0609321b 100644
--- a/pkg/controllers/kepler.go
+++ b/pkg/controllers/kepler.go
@@ -2,8 +2,7 @@ package controllers

 import (
 	"context"
-	"fmt"
-	"time"
+	"strings"

 	"github.com/go-logr/logr"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -13,26 +12,27 @@ import (

 	"github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1"
 	"github.com/sustainable.computing.io/kepler-operator/pkg/components"
-	"github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter"
 	"github.com/sustainable.computing.io/kepler-operator/pkg/reconciler"
 	"github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"

 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/util/retry"

-	secv1 "github.com/openshift/api/security/v1"
-	appsv1 "k8s.io/api/apps/v1"
-	corev1 "k8s.io/api/core/v1"
-	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

 	ctrl "sigs.k8s.io/controller-runtime"
 )

 const (
-	KeplerFinalizer = "kepler.system.sustainable.computing.io/finalizer"
+	Finalizer                 = "kepler.system.sustainable.computing.io/finalizer"
+	BpfAttachMethodAnnotation = "kepler.sustainable.computing.io/bpf-attach-method"
+	BpfAttachMethodBCC        = "bcc"
+	BpfAttachMethodLibbpf     = "libbpf"
+)
+
+var (
+	KeplerDeploymentNS = "kepler-operator"
 )

 // KeplerReconciler reconciles a Kepler object
@@ -40,51 +40,19 @@
 type KeplerReconciler struct {
 	client.Client
 	Scheme *runtime.Scheme

-	logger  logr.Logger
-	Cluster k8s.Cluster
+	logger logr.Logger
 }

 // Owned resource
 //+kubebuilder:rbac:groups=kepler.system.sustainable.computing.io,resources=*,verbs=*

-// common to all components deployed by operator
-//+kubebuilder:rbac:groups=core,resources=namespaces,verbs=list;watch;create;update;patch;delete
-//+kubebuilder:rbac:groups=core,resources=services;configmaps;serviceaccounts,verbs=list;watch;create;update;patch;delete
-//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=*,verbs=*
-
-// RBAC for running Kepler exporter
-//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=list;watch;create;update;patch;delete
-//+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=list;watch;create;update;patch;delete;use
-//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=list;watch;create;update;patch;delete
-
-// RBAC required by Kepler exporter
-//+kubebuilder:rbac:groups=core,resources=nodes/metrics;nodes/proxy;nodes/stats,verbs=get;list;watch
-
 // SetupWithManager sets up the controller with the Manager.
 func (r *KeplerReconciler) SetupWithManager(mgr ctrl.Manager) error {
-
-	// We only want to trigger a reconciliation when the generation
-	// of a child changes. Until we need to update our the status for our own objects,
-	// we can save CPU cycles by avoiding reconciliations triggered by
-	// child status changes.
-	//
-	// TODO: consider using ResourceVersionChanged predicate for resources that support it
-
-	genChanged := builder.WithPredicates(predicate.GenerationChangedPredicate{})
-
-	c := ctrl.NewControllerManagedBy(mgr).
+	return ctrl.NewControllerManagedBy(mgr).
 		For(&v1alpha1.Kepler{}).
-		Owns(&corev1.ConfigMap{}, genChanged).
-		Owns(&corev1.ServiceAccount{}, genChanged).
-		Owns(&corev1.Service{}, genChanged).
-		Owns(&appsv1.DaemonSet{}, builder.WithPredicates(predicate.ResourceVersionChangedPredicate{})).
-		Owns(&rbacv1.ClusterRoleBinding{}, genChanged).
-		Owns(&rbacv1.ClusterRole{}, genChanged)
-
-	if r.Cluster == k8s.OpenShift {
-		c = c.Owns(&secv1.SecurityContextConstraints{}, genChanged)
-	}
-	return c.Complete(r)
+		Owns(&v1alpha1.KeplerInternal{},
+			builder.WithPredicates(predicate.ResourceVersionChangedPredicate{})).
+		Complete(r)
 }

 // Reconcile is part of the main kubernetes reconciliation loop which aims to
@@ -149,214 +117,100 @@ func (r KeplerReconciler) runKeplerReconcilers(ctx context.Context, kepler *v1al
 	}.Run(ctx)
 }

-func (r KeplerReconciler) getKepler(ctx context.Context, req ctrl.Request) (*v1alpha1.Kepler, error) {
-	logger := r.logger
-
-	kepler := v1alpha1.Kepler{}
-
-	if err := r.Client.Get(ctx, req.NamespacedName, &kepler); err != nil {
-		if errors.IsNotFound(err) {
-			logger.V(3).Info("kepler could not be found; may be marked for deletion")
-			return nil, nil
-		}
-		logger.Error(err, "failed to get kepler")
-		return nil, err
-	}
-
-	return &kepler, nil
-}
-
 func (r KeplerReconciler) updateStatus(ctx context.Context, req ctrl.Request, recErr error) error {
 	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {

-		kepler, _ := r.getKepler(ctx, req)
+		k, _ := r.getKepler(ctx, req)
 		// may be deleted
-		if kepler == nil || !kepler.GetDeletionTimestamp().IsZero() {
+		if k == nil || !k.GetDeletionTimestamp().IsZero() {
 			// retry since some error has occurred
-			r.logger.V(6).Info("Reconcile has deleted kepler; skipping update")
+			r.logger.V(6).Info("kepler has been deleted; skipping status update")
 			return nil
 		}

-		kepler.Status = v1alpha1.KeplerStatus{
-			Conditions: []v1alpha1.Condition{},
+		internal, _ := r.getInternalForKepler(ctx, k)
+		// may be deleted
+		if internal == nil || !internal.GetDeletionTimestamp().IsZero() {
+			// skip the status update; the keplerinternal is gone or being deleted
+			r.logger.V(6).Info("keplerinternal has been deleted; skipping status update")
+			return nil
 		}

-		r.updateReconciledStatus(ctx, kepler, recErr)
-		r.updateAvailableStatus(ctx, kepler, recErr)
-
-		now := metav1.Now()
-		for i := range kepler.Status.Conditions {
-			kepler.Status.Conditions[i].LastTransitionTime = now
+		if !hasInternalStatusChanged(internal) {
+			r.logger.V(6).Info("keplerinternal has not changed; skipping status update")
+			return nil
 		}

-		return r.Client.Status().Update(ctx, kepler)
-
+		// NOTE: although this copies the internal status, the observed generation
+		// should be set to kepler's current generation to indicate that the
+		// current generation has been "observed"
+		k.Status = v1alpha1.KeplerStatus{
+			Exporter: internal.Status.Exporter,
+		}
+		for i := range k.Status.Exporter.Conditions {
+			k.Status.Exporter.Conditions[i].ObservedGeneration = k.Generation
+		}
+		return r.Client.Status().Update(ctx, k)
 	})
 }

-func (r KeplerReconciler) updateReconciledStatus(ctx context.Context, k *v1alpha1.Kepler, recErr error) {
-
-	reconciled := v1alpha1.Condition{
-		Type:               v1alpha1.Reconciled,
-		ObservedGeneration: k.Generation,
-		Status:             v1alpha1.ConditionTrue,
-		Reason:             v1alpha1.ReconcileComplete,
-		Message:            "Reconcile succeeded",
-	}
-
-	if recErr != nil {
-		reconciled.Status = v1alpha1.ConditionFalse
-		reconciled.Reason = v1alpha1.ReconcileError
-		reconciled.Message = recErr.Error()
+// returns true (i.e.
status has changed ) if any of the Conditions' +// ObservedGeneration is equal to the current generation +func hasInternalStatusChanged(internal *v1alpha1.KeplerInternal) bool { + for i := range internal.Status.Exporter.Conditions { + if internal.Status.Exporter.Conditions[i].ObservedGeneration == internal.Generation { + return true + } } - - k.Status.Conditions = append(k.Status.Conditions, reconciled) + return false } -func (r KeplerReconciler) updateAvailableStatus(ctx context.Context, k *v1alpha1.Kepler, recErr error) { - // get daemonset owned by kepler - dset := appsv1.DaemonSet{} - key := types.NamespacedName{Name: exporter.DaemonSetName, Namespace: components.Namespace} - if err := r.Client.Get(ctx, key, &dset); err != nil { - k.Status.Conditions = append(k.Status.Conditions, availableConditionForGetError(err)) - return - } +func (r KeplerReconciler) getKepler(ctx context.Context, req ctrl.Request) (*v1alpha1.Kepler, error) { + logger := r.logger - ds := dset.Status - k.Status.NumberMisscheduled = ds.NumberMisscheduled - k.Status.CurrentNumberScheduled = ds.CurrentNumberScheduled - k.Status.DesiredNumberScheduled = ds.DesiredNumberScheduled - k.Status.NumberReady = ds.NumberReady - k.Status.UpdatedNumberScheduled = ds.UpdatedNumberScheduled - k.Status.NumberAvailable = ds.NumberAvailable - k.Status.NumberUnavailable = ds.NumberUnavailable - - c := availableCondition(&dset) - if recErr == nil { - c.ObservedGeneration = k.Generation - } - k.Status.Conditions = append(k.Status.Conditions, c) -} + kepler := v1alpha1.Kepler{} -func availableConditionForGetError(err error) v1alpha1.Condition { - if errors.IsNotFound(err) { - return v1alpha1.Condition{ - Type: v1alpha1.Available, - Status: v1alpha1.ConditionFalse, - Reason: v1alpha1.DaemonSetNotFound, - Message: err.Error(), + if err := r.Client.Get(ctx, req.NamespacedName, &kepler); err != nil { + if errors.IsNotFound(err) { + logger.V(3).Info("kepler could not be found; may be marked for deletion") + return nil, nil } + logger.Error(err, "failed to get kepler") + return nil, err } - return v1alpha1.Condition{ - Type: v1alpha1.Available, - Status: v1alpha1.ConditionUnknown, - Reason: v1alpha1.DaemonSetError, - Message: err.Error(), - } - + return &kepler, nil } -func availableCondition(dset *appsv1.DaemonSet) v1alpha1.Condition { - ds := dset.Status - dsName := dset.Namespace + "/" + dset.Name - - if gen, ogen := dset.Generation, ds.ObservedGeneration; gen > ogen { - return v1alpha1.Condition{ - Type: v1alpha1.Available, - Status: v1alpha1.ConditionUnknown, - Reason: v1alpha1.DaemonSetOutOfSync, - Message: fmt.Sprintf( - "Generation %d of kepler daemonset %q is out of sync with the observed generation: %d", - gen, dsName, ogen), - } - } - - c := v1alpha1.Condition{Type: v1alpha1.Available} - - // NumberReady: The number of nodes that should be running the daemon pod and - // have one or more of the daemon pod running with a Ready Condition. - // - // DesiredNumberScheduled: The total number of nodes that should be running - // the daemon pod (including nodes correctly running the daemon pod). 
- if ds.NumberReady == 0 || ds.DesiredNumberScheduled == 0 { - c.Status = v1alpha1.ConditionFalse - c.Reason = v1alpha1.DaemonSetPodsNotRunning - c.Message = fmt.Sprintf("Kepler daemonset %q is not rolled out to any node; check nodeSelector and tolerations", dsName) - return c - } - - // UpdatedNumberScheduled: The total number of nodes that are running updated daemon pod - // - // DesiredNumberScheduled: The total number of nodes that should be running - // the daemon pod (including nodes correctly running the daemon pod). - - if ds.UpdatedNumberScheduled < ds.DesiredNumberScheduled { - c.Status = v1alpha1.ConditionUnknown - c.Reason = v1alpha1.DaemonSetRolloutInProgress - c.Message = fmt.Sprintf( - "Waiting for kepler daemonset %q rollout to finish: %d out of %d new pods have been updated", - dsName, ds.UpdatedNumberScheduled, ds.DesiredNumberScheduled) - return c - } - - // NumberAvailable: The number of nodes that should be running the daemon pod - // and have one or more of the daemon pod running and available (ready for at - // least spec.minReadySeconds) - - if ds.NumberAvailable < ds.DesiredNumberScheduled { - c.Status = v1alpha1.ConditionUnknown - c.Reason = v1alpha1.DaemonSetPartiallyAvailable - c.Message = fmt.Sprintf("Rollout of kepler daemonset %q is in progress: %d of %d updated pods are available", - dsName, ds.NumberAvailable, ds.DesiredNumberScheduled) - return c - } +func (r KeplerReconciler) getInternalForKepler(ctx context.Context, k *v1alpha1.Kepler) (*v1alpha1.KeplerInternal, error) { + logger := r.logger.WithValues("kepler-internal", k.Name) - // NumberUnavailable: The number of nodes that should be running the daemon - // pod and have none of the daemon pod running and available (ready for at - // least spec.minReadySeconds) - if ds.NumberUnavailable > 0 { - c.Status = v1alpha1.ConditionFalse - c.Reason = v1alpha1.DaemonSetPartiallyAvailable - c.Message = fmt.Sprintf("Waiting for kepler daemonset %q to rollout on %d nodes", dsName, ds.NumberUnavailable) - return c + internal := v1alpha1.KeplerInternal{} + if err := r.Client.Get(ctx, client.ObjectKey{Name: k.Name}, &internal); err != nil { + if errors.IsNotFound(err) { + logger.V(3).Info("kepler-internal could not be found; may be marked for deletion") + return nil, nil + } + logger.Error(err, "failed to get kepler-internal") + return nil, err } - - c.Status = v1alpha1.ConditionTrue - c.Reason = v1alpha1.DaemonSetReady - c.Message = fmt.Sprintf("Kepler daemonset %q is deployed to all nodes and available; ready %d/%d", - dsName, ds.NumberReady, ds.DesiredNumberScheduled) - return c + return &internal, nil } func (r KeplerReconciler) reconcilersForKepler(k *v1alpha1.Kepler) []reconciler.Reconciler { - rs := []reconciler.Reconciler{} - - cleanup := !k.DeletionTimestamp.IsZero() - if !cleanup { - // NOTE: create namespace first and for deletion, reverse the order - rs = append(rs, reconciler.Updater{ - Owner: k, - Resource: components.NewKeplerNamespace(), - OnError: reconciler.Requeue, - Logger: r.logger, - }) - } - - rs = append(rs, exporterReconcilers(k, r.Cluster)...) + op := deleteResource + detail := components.Metadata - // TODO: add this when modelServer is supported by Kepler Spec - // rs = append(rs, modelServerReconcilers(k)...) 
- - if cleanup { - rs = append(rs, reconciler.Deleter{ - OnError: reconciler.Requeue, - Resource: components.NewKeplerNamespace(), - WaitTimeout: 2 * time.Minute, - }) + if update := k.DeletionTimestamp.IsZero(); update { + op = newUpdaterWithOwner(k) + detail = components.Full } - // Add/Remove finalizer at the end - rs = append(rs, reconciler.Finalizer{Resource: k, Finalizer: KeplerFinalizer, Logger: r.logger}) + rs := []reconciler.Reconciler{ + op(newKeplerInternal(detail, k)), + reconciler.Finalizer{ + Resource: k, Finalizer: Finalizer, Logger: r.logger, + }, + } return rs } @@ -369,7 +223,7 @@ func (r KeplerReconciler) setInvalidStatus(ctx context.Context, req ctrl.Request return nil } - invalidKepler.Status.Conditions = []v1alpha1.Condition{{ + invalidKepler.Status.Exporter.Conditions = []v1alpha1.Condition{{ Type: v1alpha1.Reconciled, Status: v1alpha1.ConditionFalse, ObservedGeneration: invalidKepler.Generation, @@ -391,76 +245,37 @@ func (r KeplerReconciler) setInvalidStatus(ctx context.Context, req ctrl.Request return ctrl.Result{}, err } -func exporterReconcilers(k *v1alpha1.Kepler, cluster k8s.Cluster) []reconciler.Reconciler { - - if cleanup := !k.DeletionTimestamp.IsZero(); cleanup { - rs := resourceReconcilers( - deleteResource, - // cluster-scoped - exporter.NewClusterRoleBinding(components.Metadata), - exporter.NewClusterRole(components.Metadata), - ) - if cluster == k8s.OpenShift { - rs = append(rs, - resourceReconcilers(deleteResource, - exporter.NewSCC(components.Metadata, k), - exporter.NewOverviewDashboard(components.Metadata), - exporter.NewNamespaceInfoDashboard(components.Metadata), - )..., - ) +func newKeplerInternal(d components.Detail, k *v1alpha1.Kepler) *v1alpha1.KeplerInternal { + + if d == components.Metadata { + return &v1alpha1.KeplerInternal{ + TypeMeta: metav1.TypeMeta{ + Kind: "KeplerInternal", + APIVersion: v1alpha1.GroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: k.Name, + Annotations: k.Annotations, + }, } - return rs } - updater := newUpdaterForKepler(k) - rs := resourceReconcilers(updater, - // cluster-scoped resources first - exporter.NewClusterRole(components.Full), - exporter.NewClusterRoleBinding(components.Full), - - // namespace scoped - exporter.NewServiceAccount(), - exporter.NewConfigMap(components.Full, k), - exporter.NewDaemonSet(components.Full, k), - exporter.NewService(k), - exporter.NewServiceMonitor(), - exporter.NewPrometheusRule(), - ) - - if cluster == k8s.OpenShift { - rs = append(rs, - resourceReconcilers( - updater, - exporter.NewSCC(components.Full, k), - exporter.NewOverviewDashboard(components.Full), - exporter.NewNamespaceInfoDashboard(components.Full), - )..., - ) - } - return rs -} - -func resourceReconcilers(fn reconcileFn, resources ...client.Object) []reconciler.Reconciler { - rs := []reconciler.Reconciler{} - for _, res := range resources { - rs = append(rs, fn(res)) + keplerImage := Config.Image + if hasLibBPFAnnotation(k) { + keplerImage = Config.ImageLibbpf } - return rs -} -// TODO: decide if this this should move to reconciler -type reconcileFn func(client.Object) reconciler.Reconciler + isOpenShift := Config.Cluster == k8s.OpenShift -// deleteResource is a resourceFn that deletes resources -func deleteResource(obj client.Object) reconciler.Reconciler { - return &reconciler.Deleter{Resource: obj} + return &v1alpha1.KeplerInternal{ + TypeMeta: metav1.TypeMeta{Kind: "KeplerInternal", APIVersion: v1alpha1.GroupVersion.String()}, + ObjectMeta: metav1.ObjectMeta{Name: k.Name, Annotations: 
diff --git a/pkg/controllers/kepler_internal.go b/pkg/controllers/kepler_internal.go
new file mode 100644
index 00000000..fb09b0a4
--- /dev/null
+++ b/pkg/controllers/kepler_internal.go
@@ -0,0 +1,400 @@
+package controllers
+
+import (
+    "context"
+    "fmt"
+    "time"
+
+    "github.com/go-logr/logr"
+    "sigs.k8s.io/controller-runtime/pkg/builder"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/log"
+    "sigs.k8s.io/controller-runtime/pkg/predicate"
+
+    "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/components"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/reconciler"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
+
+    "k8s.io/apimachinery/pkg/api/errors"
+    "k8s.io/apimachinery/pkg/runtime"
+    "k8s.io/apimachinery/pkg/types"
+    "k8s.io/client-go/util/retry"
+
+    secv1 "github.com/openshift/api/security/v1"
+    appsv1 "k8s.io/api/apps/v1"
+    corev1 "k8s.io/api/core/v1"
+    rbacv1 "k8s.io/api/rbac/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+    ctrl "sigs.k8s.io/controller-runtime"
+)
+
+// KeplerInternalReconciler reconciles a KeplerInternal object
+type KeplerInternalReconciler struct {
+    client.Client
+    Scheme *runtime.Scheme
+
+    logger logr.Logger
+}
+
+// common to all components deployed by operator
+//+kubebuilder:rbac:groups=core,resources=namespaces,verbs=list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups=core,resources=services;configmaps;serviceaccounts,verbs=list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=*,verbs=*
+
+// RBAC for running Kepler exporter
+//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=list;watch;create;update;patch;delete;use
+//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=list;watch;create;update;patch;delete
+
+// RBAC required by Kepler exporter
+//+kubebuilder:rbac:groups=core,resources=nodes/metrics;nodes/proxy;nodes/stats,verbs=get;list;watch
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *KeplerInternalReconciler) SetupWithManager(mgr ctrl.Manager) error {
+
+    // We only want to trigger a reconciliation when the generation of a
+    // child changes. Until we need to update the status of our own objects,
+    // we can save CPU cycles by avoiding reconciliations triggered by
+    // child status changes.
+    //
+    // TODO: consider using ResourceVersionChanged predicate for resources that support it
+
+    genChanged := builder.WithPredicates(predicate.GenerationChangedPredicate{})
+
+    c := ctrl.NewControllerManagedBy(mgr).
+        For(&v1alpha1.KeplerInternal{}).
+        Owns(&corev1.ConfigMap{}, genChanged).
+        Owns(&corev1.ServiceAccount{}, genChanged).
+        Owns(&corev1.Service{}, genChanged).
+        Owns(&appsv1.DaemonSet{}, builder.WithPredicates(predicate.ResourceVersionChangedPredicate{})).
+        Owns(&rbacv1.ClusterRoleBinding{}, genChanged).
+        Owns(&rbacv1.ClusterRole{}, genChanged)
+
+    if Config.Cluster == k8s.OpenShift {
+        c = c.Owns(&secv1.SecurityContextConstraints{}, genChanged)
+    }
+    return c.Complete(r)
+}
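// Editor's sketch (not part of the patch): what the GenerationChangedPredicate
// wiring above buys. Status-only writes to owned children leave
// metadata.generation untouched, so the event is filtered out; spec edits bump
// the generation and pass through. Assumes controller-runtime's event package
// ("sigs.k8s.io/controller-runtime/pkg/event") and fmt are also imported.
func sketchGenerationFilter() {
    older := &appsv1.DaemonSet{}
    older.Generation = 1

    newer := older.DeepCopy()
    newer.Status.NumberReady = 3 // status-only update; generation stays at 1

    p := predicate.GenerationChangedPredicate{}
    fmt.Println(p.Update(event.UpdateEvent{ObjectOld: older, ObjectNew: newer})) // false: filtered

    newer.Generation = 2 // what a spec edit would do
    fmt.Println(p.Update(event.UpdateEvent{ObjectOld: older, ObjectNew: newer})) // true: reconcile
}

// The DaemonSet is the deliberate exception above: it is watched with
// ResourceVersionChangedPredicate because the Available condition is derived
// from DaemonSet status, which changes without generation bumps.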
+
+// Reconcile is part of the main kubernetes reconciliation loop which aims to
+// move the current state of the cluster closer to the desired state.
+// TODO(user): Modify the Reconcile function to compare the state specified by
+// the KeplerInternal object against the actual cluster state, and then
+// perform operations to make the cluster state reflect the state specified by
+// the user.
+//
+// For more details, check Reconcile and its Result here:
+// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
+func (r *KeplerInternalReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    logger := log.FromContext(ctx)
+    r.logger = logger
+
+    logger.Info("Start of reconcile")
+    defer logger.Info("End of reconcile")
+
+    ki, err := r.getInternal(ctx, req)
+    if err != nil {
+        // retry since some error has occurred
+        logger.V(6).Info("Get Error", "error", err)
+        return ctrl.Result{}, err
+    }
+
+    if ki == nil {
+        // no kepler-internal found, so stop here
+        logger.V(6).Info("KeplerInternal is nil")
+        return ctrl.Result{}, nil
+    }
+
+    logger.V(6).Info("Running sub reconcilers", "kepler-internal", ki.Spec)
+
+    result, recErr := r.runReconcilers(ctx, ki)
+    updateErr := r.updateStatus(ctx, req, recErr)
+
+    if recErr != nil {
+        return result, recErr
+    }
+    return result, updateErr
+}
+
+func (r KeplerInternalReconciler) runReconcilers(ctx context.Context, ki *v1alpha1.KeplerInternal) (ctrl.Result, error) {
+
+    reconcilers := r.reconcilersForInternal(ki)
+    r.logger.V(6).Info("reconcilers ...", "count", len(reconcilers))
+
+    return reconciler.Runner{
+        Reconcilers: reconcilers,
+        Client:      r.Client,
+        Scheme:      r.Scheme,
+        Logger:      r.logger,
+    }.Run(ctx)
+}
+
+func (r KeplerInternalReconciler) getInternal(ctx context.Context, req ctrl.Request) (*v1alpha1.KeplerInternal, error) {
+    logger := r.logger.WithValues("keplerinternal", req.Name)
+    ki := v1alpha1.KeplerInternal{}
+
+    if err := r.Client.Get(ctx, req.NamespacedName, &ki); err != nil {
+        if errors.IsNotFound(err) {
+            logger.V(3).Info("keplerinternal could not be found; may be marked for deletion")
+            return nil, nil
+        }
+        logger.Error(err, "failed to get keplerinternal")
+        return nil, err
+    }
+
+    return &ki, nil
+}
+
+func (r KeplerInternalReconciler) updateStatus(ctx context.Context, req ctrl.Request, recErr error) error {
+    return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+
+        ki, _ := r.getInternal(ctx, req)
+        // the object may already be deleted; skip the status update
+        if ki == nil || !ki.GetDeletionTimestamp().IsZero() {
+            r.logger.V(6).Info("Reconcile has deleted kepler; skipping update")
+            return nil
+        }
+
+        ki.Status = v1alpha1.KeplerInternalStatus{
+            Exporter: v1alpha1.ExporterStatus{
+                Conditions: []v1alpha1.Condition{},
+            },
+        }
+        r.updateReconciledStatus(ctx, ki, recErr)
+        r.updateAvailableStatus(ctx, ki, recErr)
+
+        now := metav1.Now()
+        for i := range ki.Status.Exporter.Conditions {
+            ki.Status.Exporter.Conditions[i].LastTransitionTime = now
+        }
+
+        return r.Client.Status().Update(ctx, ki)
+    })
+}
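// Editor's sketch (hypothetical helper, not in the patch): the RetryOnConflict
// pattern updateStatus above relies on, reduced to its core. Status().Update
// fails with a Conflict error whenever the stored resourceVersion has moved
// on; the closure is then re-run against a freshly read object until the
// write sticks or the backoff is exhausted.
func sketchWriteConditions(ctx context.Context, c client.Client, key client.ObjectKey, conds []v1alpha1.Condition) error {
    return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
        latest := &v1alpha1.KeplerInternal{}
        if err := c.Get(ctx, key, latest); err != nil {
            return err // non-conflict errors abort the retry loop
        }
        latest.Status.Exporter.Conditions = conds // re-apply the desired status
        return c.Status().Update(ctx, latest)     // a conflict re-runs the closure
    })
}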
+
+func (r KeplerInternalReconciler) updateReconciledStatus(ctx context.Context, ki *v1alpha1.KeplerInternal, recErr error) {
+
+    reconciled := v1alpha1.Condition{
+        Type:               v1alpha1.Reconciled,
+        ObservedGeneration: ki.Generation,
+        Status:             v1alpha1.ConditionTrue,
+        Reason:             v1alpha1.ReconcileComplete,
+        Message:            "Reconcile succeeded",
+    }
+
+    if recErr != nil {
+        reconciled.Status = v1alpha1.ConditionFalse
+        reconciled.Reason = v1alpha1.ReconcileError
+        reconciled.Message = recErr.Error()
+    }
+
+    ki.Status.Exporter.Conditions = append(ki.Status.Exporter.Conditions, reconciled)
+}
+
+func (r KeplerInternalReconciler) updateAvailableStatus(ctx context.Context, ki *v1alpha1.KeplerInternal, recErr error) {
+    // get daemonset owned by kepler
+    dset := appsv1.DaemonSet{}
+    key := types.NamespacedName{Name: ki.DaemonsetName(), Namespace: ki.Namespace()}
+    if err := r.Client.Get(ctx, key, &dset); err != nil {
+        ki.Status.Exporter.Conditions = append(ki.Status.Exporter.Conditions, availableConditionForGetError(err))
+        return
+    }
+
+    ds := dset.Status
+    ki.Status.Exporter.NumberMisscheduled = ds.NumberMisscheduled
+    ki.Status.Exporter.CurrentNumberScheduled = ds.CurrentNumberScheduled
+    ki.Status.Exporter.DesiredNumberScheduled = ds.DesiredNumberScheduled
+    ki.Status.Exporter.NumberReady = ds.NumberReady
+    ki.Status.Exporter.UpdatedNumberScheduled = ds.UpdatedNumberScheduled
+    ki.Status.Exporter.NumberAvailable = ds.NumberAvailable
+    ki.Status.Exporter.NumberUnavailable = ds.NumberUnavailable
+
+    c := availableCondition(&dset)
+    if recErr == nil {
+        c.ObservedGeneration = ki.Generation
+    }
+    ki.Status.Exporter.Conditions = append(ki.Status.Exporter.Conditions, c)
+}
+
+func availableConditionForGetError(err error) v1alpha1.Condition {
+    if errors.IsNotFound(err) {
+        return v1alpha1.Condition{
+            Type:    v1alpha1.Available,
+            Status:  v1alpha1.ConditionFalse,
+            Reason:  v1alpha1.DaemonSetNotFound,
+            Message: err.Error(),
+        }
+    }
+
+    return v1alpha1.Condition{
+        Type:    v1alpha1.Available,
+        Status:  v1alpha1.ConditionUnknown,
+        Reason:  v1alpha1.DaemonSetError,
+        Message: err.Error(),
+    }
+}
+
+func availableCondition(dset *appsv1.DaemonSet) v1alpha1.Condition {
+    ds := dset.Status
+    dsName := dset.Namespace + "/" + dset.Name
+
+    if gen, ogen := dset.Generation, ds.ObservedGeneration; gen > ogen {
+        return v1alpha1.Condition{
+            Type:   v1alpha1.Available,
+            Status: v1alpha1.ConditionUnknown,
+            Reason: v1alpha1.DaemonSetOutOfSync,
+            Message: fmt.Sprintf(
+                "Generation %d of kepler daemonset %q is out of sync with the observed generation: %d",
+                gen, dsName, ogen),
+        }
+    }
+
+    c := v1alpha1.Condition{Type: v1alpha1.Available}
+
+    // NumberReady: The number of nodes that should be running the daemon pod and
+    // have one or more of the daemon pod running with a Ready Condition.
+    //
+    // DesiredNumberScheduled: The total number of nodes that should be running
+    // the daemon pod (including nodes correctly running the daemon pod).
+    if ds.NumberReady == 0 || ds.DesiredNumberScheduled == 0 {
+        c.Status = v1alpha1.ConditionFalse
+        c.Reason = v1alpha1.DaemonSetPodsNotRunning
+        c.Message = fmt.Sprintf("Kepler daemonset %q is not rolled out to any node; check nodeSelector and tolerations", dsName)
+        return c
+    }
+
+    // UpdatedNumberScheduled: The total number of nodes that are running updated daemon pod
+    //
+    // DesiredNumberScheduled: The total number of nodes that should be running
+    // the daemon pod (including nodes correctly running the daemon pod).
+    if ds.UpdatedNumberScheduled < ds.DesiredNumberScheduled {
+        c.Status = v1alpha1.ConditionUnknown
+        c.Reason = v1alpha1.DaemonSetRolloutInProgress
+        c.Message = fmt.Sprintf(
+            "Waiting for kepler daemonset %q rollout to finish: %d out of %d new pods have been updated",
+            dsName, ds.UpdatedNumberScheduled, ds.DesiredNumberScheduled)
+        return c
+    }
+
+    // NumberAvailable: The number of nodes that should be running the daemon pod
+    // and have one or more of the daemon pod running and available (ready for at
+    // least spec.minReadySeconds)
+    if ds.NumberAvailable < ds.DesiredNumberScheduled {
+        c.Status = v1alpha1.ConditionUnknown
+        c.Reason = v1alpha1.DaemonSetPartiallyAvailable
+        c.Message = fmt.Sprintf("Rollout of kepler daemonset %q is in progress: %d of %d updated pods are available",
+            dsName, ds.NumberAvailable, ds.DesiredNumberScheduled)
+        return c
+    }
+
+    // NumberUnavailable: The number of nodes that should be running the daemon
+    // pod and have none of the daemon pod running and available (ready for at
+    // least spec.minReadySeconds)
+    if ds.NumberUnavailable > 0 {
+        c.Status = v1alpha1.ConditionFalse
+        c.Reason = v1alpha1.DaemonSetPartiallyAvailable
+        c.Message = fmt.Sprintf("Waiting for kepler daemonset %q to rollout on %d nodes", dsName, ds.NumberUnavailable)
+        return c
+    }
+
+    c.Status = v1alpha1.ConditionTrue
+    c.Reason = v1alpha1.DaemonSetReady
+    c.Message = fmt.Sprintf("Kepler daemonset %q is deployed to all nodes and available; ready %d/%d",
+        dsName, ds.NumberReady, ds.DesiredNumberScheduled)
+    return c
+}
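// Editor's sketch (not part of the patch): probing availableCondition with a
// half-finished rollout. Every node is Ready but two pods still run the old
// template, so the condition lands on Unknown/DaemonSetRolloutInProgress
// rather than True or False.
func sketchRolloutInProgress() v1alpha1.Condition {
    dset := &appsv1.DaemonSet{
        ObjectMeta: metav1.ObjectMeta{Namespace: "kepler-ns", Name: "kepler"},
        Status: appsv1.DaemonSetStatus{
            DesiredNumberScheduled: 5,
            NumberReady:            5,
            UpdatedNumberScheduled: 3, // rollout still replacing old pods
            NumberAvailable:        5,
        },
    }
    c := availableCondition(dset)
    // c.Status == v1alpha1.ConditionUnknown, c.Reason == v1alpha1.DaemonSetRolloutInProgress,
    // because UpdatedNumberScheduled (3) < DesiredNumberScheduled (5).
    return c
}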
+
+func (r KeplerInternalReconciler) reconcilersForInternal(k *v1alpha1.KeplerInternal) []reconciler.Reconciler {
+    rs := []reconciler.Reconciler{}
+
+    cleanup := !k.DeletionTimestamp.IsZero()
+    if !cleanup {
+        // NOTE: create namespace first and for deletion, reverse the order
+        rs = append(rs, reconciler.Updater{
+            Owner:    k,
+            Resource: components.NewNamespace(k.Namespace()),
+            OnError:  reconciler.Requeue,
+            Logger:   r.logger,
+        })
+    }
+
+    rs = append(rs, exporterReconcilers(k, Config.Cluster)...)
+
+    // TODO: add this when modelServer is supported by Kepler Spec
+    // rs = append(rs, modelServerReconcilers(k)...)
+
+    if cleanup {
+        rs = append(rs, reconciler.Deleter{
+            OnError:     reconciler.Requeue,
+            Resource:    components.NewNamespace(k.Namespace()),
+            WaitTimeout: 2 * time.Minute,
+        })
+    }
+
+    // WARN: the finalizer should only be removed if there weren't any errors;
+    // this bug 🐛 must be FIXED
+    rs = append(rs, reconciler.Finalizer{
+        Resource:  k,
+        Finalizer: Finalizer,
+        Logger:    r.logger,
+    })
+    return rs
+}
+
+func exporterReconcilers(ki *v1alpha1.KeplerInternal, cluster k8s.Cluster) []reconciler.Reconciler {
+
+    if cleanup := !ki.DeletionTimestamp.IsZero(); cleanup {
+        rs := resourceReconcilers(
+            deleteResource,
+            // cluster-scoped
+            exporter.NewClusterRoleBinding(components.Metadata, ki),
+            exporter.NewClusterRole(components.Metadata, ki),
+        )
+        rs = append(rs, resourceReconcilers(deleteResource, openshiftResources(ki, cluster)...)...)
+        return rs
+    }
+
+    updateResource := newUpdaterWithOwner(ki)
+    // cluster-scoped resources first
+    rs := resourceReconcilers(updateResource,
+        exporter.NewClusterRole(components.Full, ki),
+        exporter.NewClusterRoleBinding(components.Full, ki),
+
+        // namespace scoped
+        exporter.NewServiceAccount(ki),
+        exporter.NewConfigMap(components.Full, ki),
+        exporter.NewDaemonSet(components.Full, ki),
+        exporter.NewService(ki),
+        exporter.NewServiceMonitor(ki),
+        exporter.NewPrometheusRule(ki),
+    )
+    rs = append(rs, resourceReconcilers(updateResource, openshiftResources(ki, cluster)...)...)
+    return rs
+}
+
+func openshiftResources(ki *v1alpha1.KeplerInternal, cluster k8s.Cluster) []client.Object {
+    oshift := ki.Spec.OpenShift
+
+    if cluster != k8s.OpenShift || !oshift.Enabled {
+        return nil
+    }
+    // cluster-scoped resources first
+    res := []client.Object{
+        exporter.NewSCC(components.Full, ki),
+    }
+    if oshift.Dashboard.Enabled {
+        res = append(res,
+            exporter.NewOverviewDashboard(components.Full),
+            exporter.NewNamespaceInfoDashboard(components.Full),
+        )
+    }
+    return res
+}
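openshiftResources gates the OpenShift-only objects twice: the SCC requires Spec.OpenShift.Enabled on an OpenShift cluster, and the two dashboards additionally require Dashboard.Enabled. A quick sketch of the gating (hypothetical values, not part of the patch):

func sketchOpenShiftGating() []client.Object {
    ki := &v1alpha1.KeplerInternal{
        ObjectMeta: metav1.ObjectMeta{Name: "kepler"},
        Spec: v1alpha1.KeplerInternalSpec{
            OpenShift: v1alpha1.OpenShiftSpec{
                Enabled:   true,
                Dashboard: v1alpha1.DashboardSpec{Enabled: false},
            },
        },
    }
    objs := openshiftResources(ki, k8s.OpenShift)
    // len(objs) == 1: just the SCC; the dashboards are skipped here.
    // On a non-OpenShift cluster, or with Enabled: false, the result is nil.
    return objs
}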
diff --git a/pkg/controllers/kepler_test.go b/pkg/controllers/kepler_test.go
new file mode 100644
index 00000000..f98c48a0
--- /dev/null
+++ b/pkg/controllers/kepler_test.go
@@ -0,0 +1,76 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package controllers
+
+import (
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func TestBpfAttachMethod(t *testing.T) {
+
+    tt := []struct {
+        annotations map[string]string
+        scenario    string
+        IsLibbpf    bool
+    }{
+        {
+            annotations: map[string]string{},
+            IsLibbpf:    false,
+            scenario:    "no annotation",
+        },
+        {
+            annotations: map[string]string{
+                BpfAttachMethodAnnotation: "junk",
+            },
+            IsLibbpf: false,
+            scenario: "annotation present but not libbpf",
+        },
+        {
+            annotations: map[string]string{
+                BpfAttachMethodAnnotation: "bcc",
+            },
+            IsLibbpf: false,
+            scenario: "annotation present with bcc",
+        },
+        {
+            annotations: map[string]string{
+                BpfAttachMethodAnnotation: "libbpf",
+            },
+            IsLibbpf: true,
+            scenario: "annotation present with libbpf",
+        },
+    }
+    for _, tc := range tt {
+        tc := tc
+        t.Run(tc.scenario, func(t *testing.T) {
+            t.Parallel()
+            k := v1alpha1.Kepler{
+                ObjectMeta: metav1.ObjectMeta{
+                    Annotations: tc.annotations,
+                },
+                Spec: v1alpha1.KeplerSpec{
+                    Exporter: v1alpha1.ExporterSpec{},
+                },
+            }
+            actual := hasLibBPFAnnotation(&k)
+            assert.Equal(t, tc.IsLibbpf, actual)
+        })
+    }
+}
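One detail worth calling out in the table test above: `tc := tc` re-declares the loop variable before t.Parallel(). With pre-Go 1.22 loop semantics, the parallel subtest bodies run only after the range loop has finished, so without the copy every subtest would observe the final table entry. An illustrative fragment of the pattern (not part of the patch):

for _, tc := range tt {
    tc := tc // shadow the loop variable: each parallel closure gets its own copy
    t.Run(tc.scenario, func(t *testing.T) {
        t.Parallel() // the body runs later, after the loop has advanced
        _ = tc       // safe: refers to this iteration's copy, not the last entry
    })
}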
diff --git a/pkg/controllers/reconcilers.go b/pkg/controllers/reconcilers.go
new file mode 100644
index 00000000..6ab02336
--- /dev/null
+++ b/pkg/controllers/reconcilers.go
@@ -0,0 +1,47 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package controllers
+
+import (
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+
+    "github.com/sustainable.computing.io/kepler-operator/pkg/reconciler"
+)
+
+func resourceReconcilers(fn reconcileFn, resources ...client.Object) []reconciler.Reconciler {
+    rs := []reconciler.Reconciler{}
+    for _, res := range resources {
+        rs = append(rs, fn(res))
+    }
+    return rs
+}
+
+// TODO: decide if this should move to reconciler
+type reconcileFn func(client.Object) reconciler.Reconciler
+
+// newUpdaterWithOwner returns a reconcileFn that updates the resource and
+// sets its owner reference to owner
+func newUpdaterWithOwner(owner metav1.Object) reconcileFn {
+    return func(obj client.Object) reconciler.Reconciler {
+        return &reconciler.Updater{Owner: owner, Resource: obj}
+    }
+}
+
+// deleteResource is a reconcileFn that deletes resources
+func deleteResource(obj client.Object) reconciler.Reconciler {
+    return &reconciler.Deleter{Resource: obj}
+}
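These helpers exist so one resource list can serve both directions of the lifecycle. An assumed composition (not part of the patch; note the real exporterReconcilers also drops the render detail from Full to Metadata on cleanup):

func sketchExporterRBAC(ki *v1alpha1.KeplerInternal, cleanup bool) []reconciler.Reconciler {
    fn := newUpdaterWithOwner(ki) // Updater that sets ownerReferences to ki
    if cleanup {
        fn = deleteResource // Deleter; ownership is irrelevant on the way out
    }
    return resourceReconcilers(fn,
        exporter.NewClusterRole(components.Full, ki),
        exporter.NewClusterRoleBinding(components.Full, ki),
    )
}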
diff --git a/pkg/reconciler/updater.go b/pkg/reconciler/updater.go
index 67948381..4ed8a65f 100644
--- a/pkg/reconciler/updater.go
+++ b/pkg/reconciler/updater.go
@@ -50,6 +50,8 @@ func (r Updater) Reconcile(ctx context.Context, c client.Client, scheme *runtime
         }
     }
 
+    r.Logger.V(8).Info("updating resource", "resource", k8s.GVKName(r.Resource))
+
     if err := c.Patch(ctx, r.Resource, client.Apply, client.ForceOwnership, client.FieldOwner("kepler-operator")); err != nil {
         if errors.IsConflict(err) || errors.IsAlreadyExists(err) {
             // the cache may be stale; requests a Reconcile
diff --git a/pkg/utils/test/framework.go b/pkg/utils/test/framework.go
index 5779a042..0f8b2504 100644
--- a/pkg/utils/test/framework.go
+++ b/pkg/utils/test/framework.go
@@ -189,7 +189,7 @@ func (f Framework) WaitUntilKeplerCondition(name string, t v1alpha1.ConditionTyp
             return true, fmt.Errorf("kepler %s is not found", name)
         }
 
-        condition, _ := k8s.FindCondition(k.Status.Conditions, t)
+        condition, _ := k8s.FindCondition(k.Status.Exporter.Conditions, t)
         return condition.Status == s, nil
     })
     return &k
diff --git a/tests/e2e/kepler_test.go b/tests/e2e/kepler_test.go
index 82ab780f..d6e616af 100644
--- a/tests/e2e/kepler_test.go
+++ b/tests/e2e/kepler_test.go
@@ -8,7 +8,7 @@ import (
 
     "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1"
     "github.com/sustainable.computing.io/kepler-operator/pkg/components"
-    "github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter"
+    "github.com/sustainable.computing.io/kepler-operator/pkg/controllers"
     "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s"
     "github.com/sustainable.computing.io/kepler-operator/pkg/utils/test"
     appsv1 "k8s.io/api/apps/v1"
@@ -20,22 +20,22 @@ func TestKepler_Deletion(t *testing.T) {
 
     // pre-condition: ensure kepler exists
     f.CreateKepler("kepler")
-    f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
+    k := f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
 
     //
     ds := appsv1.DaemonSet{}
     f.AssertResourceExists(
-        exporter.DaemonSetName,
-        components.Namespace,
+        k.Name,
+        controllers.KeplerDeploymentNS,
         &ds,
         test.Timeout(10*time.Second),
     )
 
     f.DeleteKepler("kepler")
 
-    ns := components.NewKeplerNamespace()
+    ns := components.NewNamespace(controllers.KeplerDeploymentNS)
     f.AssertNoResourceExists(ns.Name, "", ns)
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertNoResourceExists(ds.Name, ds.Namespace, &ds)
 }
 
 func TestKepler_Reconciliation(t *testing.T) {
@@ -45,28 +45,27 @@ func TestKepler_Reconciliation(t *testing.T) {
     f.AssertNoResourceExists("kepler", "", &v1alpha1.Kepler{}, test.NoWait())
 
     // when
-    f.CreateKepler("kepler")
+    k := f.CreateKepler("kepler")
 
     // then
-    f.AssertResourceExists(components.Namespace, "", &corev1.Namespace{})
+    f.AssertResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
 
     ds := appsv1.DaemonSet{}
-    f.AssertResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 
     kepler := f.WaitUntilKeplerCondition("kepler", v1alpha1.Reconciled, v1alpha1.ConditionTrue)
     // ensure the default toleration is set
     assert.Equal(t, []corev1.Toleration{{Operator: "Exists"}}, kepler.Spec.Exporter.Deployment.Tolerations)
 
-    reconciled, err := k8s.FindCondition(kepler.Status.Conditions, v1alpha1.Reconciled)
+    reconciled, err := k8s.FindCondition(kepler.Status.Exporter.Conditions, v1alpha1.Reconciled)
     assert.NoError(t, err, "unable to get reconciled condition")
     assert.Equal(t, reconciled.ObservedGeneration, kepler.Generation)
     assert.Equal(t, reconciled.Status, v1alpha1.ConditionTrue)
 
     kepler = f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
-    available, err := k8s.FindCondition(kepler.Status.Conditions, v1alpha1.Available)
+    available, err := k8s.FindCondition(kepler.Status.Exporter.Conditions, v1alpha1.Available)
     assert.NoError(t, err, "unable to get available condition")
     assert.Equal(t, available.ObservedGeneration, kepler.Generation)
     assert.Equal(t, available.Status, v1alpha1.ConditionTrue)
-
 }
 
 func TestBadKepler_Reconciliation(t *testing.T) {
@@ -74,10 +73,10 @@ func TestBadKepler_Reconciliation(t *testing.T) {
     // Ensure Kepler is not deployed (by any chance)
     f.AssertNoResourceExists("kepler", "", &v1alpha1.Kepler{}, test.Timeout(10*time.Second))
     f.AssertNoResourceExists("invalid-name", "", &v1alpha1.Kepler{}, test.NoWait())
-    f.CreateKepler("invalid-name")
+    k := f.CreateKepler("invalid-name")
 
     ds := appsv1.DaemonSet{}
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertNoResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 }
 
 func TestNodeSelector(t *testing.T) {
@@ -93,20 +92,19 @@ func TestNodeSelector(t *testing.T) {
     err := f.AddResourceLabels("node", node.Name, labels)
     assert.NoError(t, err, "could not label node")
 
-    f.CreateKepler("kepler", f.WithNodeSelector(labels))
+    k := f.CreateKepler("kepler", f.WithNodeSelector(labels))
 
-    f.AssertResourceExists(components.Namespace, "", &corev1.Namespace{})
+    f.AssertResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
     ds := appsv1.DaemonSet{}
-    f.AssertResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 
     kepler := f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
-    assert.EqualValues(t, 1, kepler.Status.NumberAvailable)
+    assert.EqualValues(t, 1, kepler.Status.Exporter.NumberAvailable)
 
     f.DeleteKepler("kepler")
-    ns := components.NewKeplerNamespace()
-    f.AssertNoResourceExists(ns.Name, "", ns)
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertNoResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
+    f.AssertNoResourceExists(ds.Name, ds.Namespace, &ds)
 }
 
 func TestNodeSelectorUnavailableLabel(t *testing.T) {
@@ -119,20 +117,19 @@ func TestNodeSelectorUnavailableLabel(t *testing.T) {
 
     var unavailableLabels k8s.StringMap = map[string]string{"e2e-test": "true"}
 
-    f.CreateKepler("kepler", f.WithNodeSelector(unavailableLabels))
+    k := f.CreateKepler("kepler", f.WithNodeSelector(unavailableLabels))
 
-    f.AssertResourceExists(components.Namespace, "", &corev1.Namespace{})
+    f.AssertResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
     ds := appsv1.DaemonSet{}
-    f.AssertResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 
     kepler := f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionFalse)
-    assert.EqualValues(t, 0, kepler.Status.NumberAvailable)
+    assert.EqualValues(t, 0, kepler.Status.Exporter.NumberAvailable)
 
     f.DeleteKepler("kepler")
-    ns := components.NewKeplerNamespace()
-    f.AssertNoResourceExists(ns.Name, "", ns)
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertNoResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
+    f.AssertNoResourceExists(ds.Name, ds.Namespace, &ds)
 }
 
 func TestTaint_WithToleration(t *testing.T) {
@@ -155,21 +152,20 @@ func TestTaint_WithToleration(t *testing.T) {
     err = f.TaintNode(node.Name, e2eTestTaint.ToString())
     assert.NoError(t, err, "failed to taint node %s", node)
 
-    f.CreateKepler("kepler", f.WithTolerations(append(node.Spec.Taints, e2eTestTaint)))
-    f.AssertResourceExists(components.Namespace, "", &corev1.Namespace{})
+    k := f.CreateKepler("kepler", f.WithTolerations(append(node.Spec.Taints, e2eTestTaint)))
+    f.AssertResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
 
     ds := appsv1.DaemonSet{}
-    f.AssertResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 
     kepler := f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
-    assert.EqualValues(t, len(nodes), kepler.Status.NumberAvailable)
+    assert.EqualValues(t, len(nodes), kepler.Status.Exporter.NumberAvailable)
 
     f.DeleteKepler("kepler")
-    ns := components.NewKeplerNamespace()
-    f.AssertNoResourceExists(ns.Name, "", ns)
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
-
+    f.AssertNoResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
+    f.AssertNoResourceExists(ds.Name, ds.Namespace, &ds)
 }
+
 func TestBadTaint_WithToleration(t *testing.T) {
     f := test.NewFramework(t)
@@ -193,19 +189,18 @@ func TestBadTaint_WithToleration(t *testing.T) {
     err := f.TaintNode(node.Name, e2eTestTaint.ToString())
     assert.NoError(t, err, "failed to taint node %s", node)
 
-    f.CreateKepler("kepler", f.WithTolerations(append(node.Spec.Taints, badTestTaint)))
+    k := f.CreateKepler("kepler", f.WithTolerations(append(node.Spec.Taints, badTestTaint)))
 
-    f.AssertResourceExists(components.Namespace, "", &corev1.Namespace{})
+    f.AssertResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
 
     ds := appsv1.DaemonSet{}
-    f.AssertResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertResourceExists(k.Name, controllers.KeplerDeploymentNS, &ds)
 
     kepler := f.WaitUntilKeplerCondition("kepler", v1alpha1.Available, v1alpha1.ConditionTrue)
-    assert.EqualValues(t, len(nodes)-1, kepler.Status.NumberAvailable)
+    assert.EqualValues(t, len(nodes)-1, kepler.Status.Exporter.NumberAvailable)
 
     f.DeleteKepler("kepler")
-    ns := components.NewKeplerNamespace()
-    f.AssertNoResourceExists(ns.Name, "", ns)
-    f.AssertNoResourceExists(exporter.DaemonSetName, components.Namespace, &ds)
+    f.AssertNoResourceExists(controllers.KeplerDeploymentNS, "", &corev1.Namespace{})
+    f.AssertNoResourceExists(ds.Name, ds.Namespace, &ds)
 }
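All of the e2e rewrites above follow from one naming change: the exporter DaemonSet is now named after the Kepler CR instead of the old fixed exporter.DaemonSetName, and it always lives in controllers.KeplerDeploymentNS. A hypothetical lookup showing the new coordinates (not part of the patch; c and ctx would come from the test harness):

func sketchDaemonSetLookup(ctx context.Context, c client.Client, k *v1alpha1.Kepler) (*appsv1.DaemonSet, error) {
    key := types.NamespacedName{
        Name:      k.Name, // e.g. "kepler"
        Namespace: controllers.KeplerDeploymentNS,
    }
    ds := appsv1.DaemonSet{}
    err := c.Get(ctx, key, &ds)
    return &ds, err
}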
diff --git a/tests/run-e2e.sh b/tests/run-e2e.sh
index 83140b01..e2f667bb 100755
--- a/tests/run-e2e.sh
+++ b/tests/run-e2e.sh
@@ -151,7 +151,9 @@ run_e2e() {
     local ret=0
     go test -v -failfast -timeout $TEST_TIMEOUT \
-        ./tests/e2e/... 2>&1 | tee "$LOGS_DIR/e2e.log" || ret=1
+        ./tests/e2e/... \
+        -run Reconcile \
+        2>&1 | tee "$LOGS_DIR/e2e.log" || ret=1
 
     # terminate both log_events
     { jobs -p | xargs -I {} -- pkill -TERM -P {}; } || true
@@ -343,7 +345,7 @@ ensure_deploy_img_is_always_pulled() {
     local pull_policy
     pull_policy=$(kubectl get deploy/$OPERATOR_DEPLOY_NAME \
         -n "$OPERATORS_NS" \
-        -ojsonpath='{.spec.template.spec.containers[1].imagePullPolicy}')
+        -ojsonpath='{.spec.template.spec.containers[0].imagePullPolicy}')
 
     if [[ "$pull_policy" != "Always" ]]; then
         info "Edit $OPERATOR_DEPLOY_YAML imagePullPolicy and redeploy"