From 54449121388591bc2a36d0051fe1ade384e5a43e Mon Sep 17 00:00:00 2001 From: Sunyanan Choochotkaew Date: Fri, 8 Dec 2023 18:39:48 +0900 Subject: [PATCH] add model-server/estimator to KeplerInternal Signed-off-by: Sunyanan Choochotkaew --- ...kepler-operator.clusterserviceversion.yaml | 4 +- ...tainable.computing.io_keplerinternals.yaml | 338 ++++++ cmd/manager/main.go | 5 + ...tainable.computing.io_keplerinternals.yaml | 338 ++++++ config/manager/manager.yaml | 2 +- ...kepler-operator.clusterserviceversion.yaml | 1 - config/rbac/role.yaml | 2 + docs/api.md | 1044 ++++++++++++++++- pkg/api/v1alpha1/kepler_internal_types.go | 104 +- pkg/api/v1alpha1/zz_generated.deepcopy.go | 155 +++ pkg/components/estimator/estimator.go | 139 +++ pkg/components/estimator/estimator_test.go | 157 +++ pkg/components/exporter/exporter.go | 184 +-- pkg/components/exporter/exporter_test.go | 2 - pkg/components/modelserver/modelserver.go | 201 ++-- .../modelserver/modelserver_test.go | 144 +++ pkg/controllers/config.go | 8 + pkg/controllers/kepler_internal.go | 85 +- pkg/reconciler/runner.go | 1 + pkg/utils/k8s/k8s.go | 16 + pkg/utils/test/assertions.go | 21 + pkg/utils/test/framework.go | 31 + pkg/utils/test/kepler_internal_builder.go | 23 + tests/e2e/kepler_internal_test.go | 124 +- 24 files changed, 2851 insertions(+), 278 deletions(-) create mode 100644 pkg/components/estimator/estimator.go create mode 100644 pkg/components/estimator/estimator_test.go create mode 100644 pkg/components/modelserver/modelserver_test.go diff --git a/bundle/manifests/kepler-operator.clusterserviceversion.yaml b/bundle/manifests/kepler-operator.clusterserviceversion.yaml index 6f14ef33..6d1a2501 100644 --- a/bundle/manifests/kepler-operator.clusterserviceversion.yaml +++ b/bundle/manifests/kepler-operator.clusterserviceversion.yaml @@ -27,7 +27,7 @@ metadata: capabilities: Basic Install categories: Monitoring containerImage: quay.io/sustainable_computing_io/kepler-operator:0.9.2 - createdAt: 
"2023-12-04T08:14:16Z" + createdAt: "2023-12-08T08:56:48Z" description: 'Deploys and Manages Kepler on Kubernetes ' operators.operatorframework.io/builder: operator-sdk-v1.27.0 operators.operatorframework.io/internal-objects: |- @@ -108,6 +108,7 @@ spec: - apps resources: - daemonsets + - deployments verbs: - create - delete @@ -119,6 +120,7 @@ spec: - "" resources: - configmaps + - persistentvolumeclaims - serviceaccounts - services verbs: diff --git a/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml b/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml index 410a671f..3cec3f06 100644 --- a/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml +++ b/bundle/manifests/kepler.system.sustainable.computing.io_keplerinternals.yaml @@ -47,6 +47,12 @@ spec: name: Tolerations priority: 10 type: string + - jsonPath: .status.estimator.status + name: Estimator + type: string + - jsonPath: .status.modelServer.status + name: Model-Server + type: string name: v1alpha1 schema: openAPIV3Schema: @@ -67,6 +73,78 @@ spec: spec: description: KeplerInternalSpec defines the desired state of KeplerInternal properties: + estimator: + description: Estimator Spec + properties: + container: + properties: + components: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + total: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + type: object + image: + type: string + node: + properties: + components: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + total: + 
properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + type: object + type: object exporter: properties: deployment: @@ -146,6 +224,256 @@ spec: required: - deployment type: object + modelServer: + description: Kepler Model Server Spec + properties: + enabled: + default: false + type: boolean + errKey: + default: "" + type: string + image: + type: string + listPath: + default: "" + type: string + path: + default: "" + type: string + pipelineUrl: + default: "" + type: string + port: + default: 8100 + maximum: 65535 + minimum: 1 + type: integer + requestPath: + default: "" + type: string + storage: + properties: + persistentVolumeClaim: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'accessModes contains the desired access + modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: 'dataSource field can be used to specify + either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) If the provisioner + or an external controller can support the specified + data source, it will create a new volume based on the + contents of the specified data source. When the AnyVolumeDataSource + feature gate is enabled, dataSource contents will be + copied to dataSourceRef, and dataSourceRef contents + will be copied to dataSource when dataSourceRef.namespace + is not specified. If the namespace is specified, then + dataSourceRef will not be copied to dataSource.' + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. 
If APIGroup is not specified, + the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + type: object + x-kubernetes-map-type: atomic + dataSourceRef: + description: 'dataSourceRef specifies the object from + which to populate the volume with data, if a non-empty + volume is desired. This may be any object from a non-empty + API group (non core object) or a PersistentVolumeClaim + object. When this field is specified, volume binding + will only succeed if the type of the specified object + matches some installed volume populator or dynamic provisioner. + This field will replace the functionality of the dataSource + field and as such if both fields are non-empty, they + must have the same value. For backwards compatibility, + when namespace isn''t specified in dataSourceRef, both + fields (dataSource and dataSourceRef) will be set to + the same value automatically if one of them is empty + and the other is non-empty. When namespace is specified + in dataSourceRef, dataSource isn''t set to the same + value and must be empty. There are three important differences + between dataSource and dataSourceRef: * While dataSource + only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim + objects. * While dataSource ignores disallowed values + (dropping them), dataSourceRef preserves all values, + and generates an error if a disallowed value is specified. + * While dataSource only allows local objects, dataSourceRef + allows objects in any namespaces. (Beta) Using this + field requires the AnyVolumeDataSource feature gate + to be enabled. 
(Alpha) Using the namespace field of + dataSourceRef requires the CrossNamespaceVolumeDataSource + feature gate to be enabled.' + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, + the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + namespace: + description: Namespace is the namespace of resource + being referenced Note that when a namespace is specified, + a gateway.networking.k8s.io/ReferenceGrant object + is required in the referent namespace to allow that + namespace's owner to accept the reference. See the + ReferenceGrant documentation for details. (Alpha) + This field requires the CrossNamespaceVolumeDataSource + feature gate to be enabled. + type: string + required: + - kind + - name + type: object + resources: + description: 'resources represents the minimum resources + the volume should have. If RecoverVolumeExpansionFailure + feature is enabled users are allowed to specify resource + requirements that are lower than previous value but + must still be higher than capacity recorded in the status + field of the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + properties: + claims: + description: "Claims lists the names of resources, + defined in spec.resourceClaims, that are used by + this container. \n This is an alpha field and requires + enabling the DynamicResourceAllocation feature gate. + \n This field is immutable. It can only be set for + containers." + items: + description: ResourceClaim references one entry + in PodSpec.ResourceClaims. + properties: + name: + description: Name must match the name of one + entry in pod.spec.resourceClaims of the Pod + where this field is used. 
It makes that resource + available inside a container. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount + of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. Requests cannot exceed Limits. More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + selector: + description: selector is a label query over volumes to + consider for binding. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. 
+ If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + storageClassName: + description: 'storageClassName is the name of the StorageClass + required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is + required by the claim. Value of Filesystem is implied + when not included in claim spec. + type: string + volumeName: + description: volumeName is the binding reference to the + PersistentVolume backing this claim. 
+ type: string + type: object + type: object + url: + default: "" + type: string + type: object openshift: properties: dashboard: @@ -166,6 +494,11 @@ spec: status: description: KeplerInternalStatus represents status of KeplerInternal properties: + estimator: + properties: + status: + type: string + type: object exporter: description: ExporterStatus defines the observed state of Kepler Exporter properties: @@ -261,6 +594,11 @@ spec: - numberMisscheduled - numberReady type: object + modelServer: + properties: + status: + type: string + type: object type: object type: object served: true diff --git a/cmd/manager/main.go b/cmd/manager/main.go index b8546fc3..9baba73f 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -39,7 +39,9 @@ import ( securityv1 "github.com/openshift/api/security/v1" keplersystemv1alpha1 "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" + "github.com/sustainable.computing.io/kepler-operator/pkg/components/estimator" "github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter" + "github.com/sustainable.computing.io/kepler-operator/pkg/components/modelserver" "github.com/sustainable.computing.io/kepler-operator/pkg/controllers" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" //+kubebuilder:scaffold:imports @@ -99,6 +101,9 @@ func main() { flag.StringVar(&controllers.Config.Image, "kepler.image", keplerImage, "kepler image") flag.StringVar(&controllers.Config.ImageLibbpf, "kepler.image.libbpf", keplerImageLibbpf, "kepler libbpf image") + flag.StringVar(&controllers.InternalConfig.EstimatorImage, "estimator.image", estimator.StableImage, "kepler estimator image") + flag.StringVar(&controllers.InternalConfig.ModelServerImage, "model-server.image", modelserver.StableImage, "kepler model server image") + opts := zap.Options{ Development: true, } diff --git a/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml 
b/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml index 7b07bfbf..ef023459 100644 --- a/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml +++ b/config/crd/bases/kepler.system.sustainable.computing.io_keplerinternals.yaml @@ -47,6 +47,12 @@ spec: name: Tolerations priority: 10 type: string + - jsonPath: .status.estimator.status + name: Estimator + type: string + - jsonPath: .status.modelServer.status + name: Model-Server + type: string name: v1alpha1 schema: openAPIV3Schema: @@ -67,6 +73,78 @@ spec: spec: description: KeplerInternalSpec defines the desired state of KeplerInternal properties: + estimator: + description: Estimator Spec + properties: + container: + properties: + components: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + total: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + type: object + image: + type: string + node: + properties: + components: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + total: + properties: + initUrl: + type: string + selector: + properties: + filterConditions: + type: string + modelName: + type: string + type: object + sidecar: + default: false + type: boolean + type: object + type: object + type: object exporter: properties: deployment: @@ -146,6 +224,256 @@ spec: required: - deployment type: object + modelServer: + description: Kepler Model Server Spec + properties: + enabled: + default: false + type: boolean + errKey: + default: "" + type: string + image: + type: string + listPath: + default: "" + type: 
string + path: + default: "" + type: string + pipelineUrl: + default: "" + type: string + port: + default: 8100 + maximum: 65535 + minimum: 1 + type: integer + requestPath: + default: "" + type: string + storage: + properties: + persistentVolumeClaim: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'accessModes contains the desired access + modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: 'dataSource field can be used to specify + either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) If the provisioner + or an external controller can support the specified + data source, it will create a new volume based on the + contents of the specified data source. When the AnyVolumeDataSource + feature gate is enabled, dataSource contents will be + copied to dataSourceRef, and dataSourceRef contents + will be copied to dataSource when dataSourceRef.namespace + is not specified. If the namespace is specified, then + dataSourceRef will not be copied to dataSource.' + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, + the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + type: object + x-kubernetes-map-type: atomic + dataSourceRef: + description: 'dataSourceRef specifies the object from + which to populate the volume with data, if a non-empty + volume is desired. 
This may be any object from a non-empty + API group (non core object) or a PersistentVolumeClaim + object. When this field is specified, volume binding + will only succeed if the type of the specified object + matches some installed volume populator or dynamic provisioner. + This field will replace the functionality of the dataSource + field and as such if both fields are non-empty, they + must have the same value. For backwards compatibility, + when namespace isn''t specified in dataSourceRef, both + fields (dataSource and dataSourceRef) will be set to + the same value automatically if one of them is empty + and the other is non-empty. When namespace is specified + in dataSourceRef, dataSource isn''t set to the same + value and must be empty. There are three important differences + between dataSource and dataSourceRef: * While dataSource + only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim + objects. * While dataSource ignores disallowed values + (dropping them), dataSourceRef preserves all values, + and generates an error if a disallowed value is specified. + * While dataSource only allows local objects, dataSourceRef + allows objects in any namespaces. (Beta) Using this + field requires the AnyVolumeDataSource feature gate + to be enabled. (Alpha) Using the namespace field of + dataSourceRef requires the CrossNamespaceVolumeDataSource + feature gate to be enabled.' + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, + the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. 
+ type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + namespace: + description: Namespace is the namespace of resource + being referenced Note that when a namespace is specified, + a gateway.networking.k8s.io/ReferenceGrant object + is required in the referent namespace to allow that + namespace's owner to accept the reference. See the + ReferenceGrant documentation for details. (Alpha) + This field requires the CrossNamespaceVolumeDataSource + feature gate to be enabled. + type: string + required: + - kind + - name + type: object + resources: + description: 'resources represents the minimum resources + the volume should have. If RecoverVolumeExpansionFailure + feature is enabled users are allowed to specify resource + requirements that are lower than previous value but + must still be higher than capacity recorded in the status + field of the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' + properties: + claims: + description: "Claims lists the names of resources, + defined in spec.resourceClaims, that are used by + this container. \n This is an alpha field and requires + enabling the DynamicResourceAllocation feature gate. + \n This field is immutable. It can only be set for + containers." + items: + description: ResourceClaim references one entry + in PodSpec.ResourceClaims. + properties: + name: + description: Name must match the name of one + entry in pod.spec.resourceClaims of the Pod + where this field is used. It makes that resource + available inside a container. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Limits describes the maximum amount + of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. Requests cannot exceed Limits. More info: + https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' + type: object + type: object + selector: + description: selector is a label query over volumes to + consider for binding. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. 
If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + storageClassName: + description: 'storageClassName is the name of the StorageClass + required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is + required by the claim. Value of Filesystem is implied + when not included in claim spec. + type: string + volumeName: + description: volumeName is the binding reference to the + PersistentVolume backing this claim. 
+ type: string + type: object + type: object + url: + default: "" + type: string + type: object openshift: properties: dashboard: @@ -166,6 +494,11 @@ spec: status: description: KeplerInternalStatus represents status of KeplerInternal properties: + estimator: + properties: + status: + type: string + type: object exporter: description: ExporterStatus defines the observed state of Kepler Exporter properties: @@ -261,6 +594,11 @@ spec: - numberMisscheduled - numberReady type: object + modelServer: + properties: + status: + type: string + type: object type: object type: object served: true diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index b0f95c6f..e1df20f5 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -74,7 +74,7 @@ spec: - name: RELATED_IMAGE_KEPLER_LIBBPF value: '' args: - # TODO: move --openshift to openshift specific kustomize directory + # TODO: move --openshift and deployment-namespace to openshift specific kustomize directory - --openshift - --leader-elect - --kepler.image=$(RELATED_IMAGE_KEPLER) diff --git a/config/manifests/bases/kepler-operator.clusterserviceversion.yaml b/config/manifests/bases/kepler-operator.clusterserviceversion.yaml index 30efac97..f7e86a1d 100644 --- a/config/manifests/bases/kepler-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/kepler-operator.clusterserviceversion.yaml @@ -100,7 +100,6 @@ spec: resources: - configmaps - persistentvolumeclaims - - persistentvolumes - serviceaccounts - services verbs: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index c794599d..667509dc 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -8,6 +8,7 @@ rules: - apps resources: - daemonsets + - deployments verbs: - create - delete @@ -19,6 +20,7 @@ rules: - "" resources: - configmaps + - persistentvolumeclaims - serviceaccounts - services verbs: diff --git a/docs/api.md b/docs/api.md index d2c1293a..80b44432 100644 --- a/docs/api.md +++ b/docs/api.md @@ 
-92,23 +92,737 @@ KeplerInternalSpec defines the desired state of KeplerInternal
true + + estimator + object + + Estimator Spec
+ + false + + modelServer + object + + Kepler Model Server Spec
+ + false openshift object -
+
+ + false + + + + +### KeplerInternal.spec.exporter +[↩ Parent](#keplerinternalspec) + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
deploymentobject +
+
true
+ + +### KeplerInternal.spec.exporter.deployment +[↩ Parent](#keplerinternalspecexporter) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
imagestring + Image of kepler-exporter to be deployed
+
true
namespacestring + Namespace where kepler-exporter will be deployed
+
true
nodeSelectormap[string]string + Defines which Nodes the Pod is scheduled on
+
+ Default: map[kubernetes.io/os:linux]
+
false
portinteger +
+
+ Format: int32
+ Default: 9103
+ Minimum: 1
+ Maximum: 65535
+
false
tolerations[]object + If specified, define Pod's tolerations
+
+ Default: [map[effect: key: operator:Exists value:]]
+
false
+ + +### KeplerInternal.spec.exporter.deployment.tolerations[index] +[↩ Parent](#keplerinternalspecexporterdeployment) + + + +The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
effectstring + Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
+
false
keystring + Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.
+
false
operatorstring + Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+
false
tolerationSecondsinteger + TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.
+
+ Format: int64
+
false
valuestring + Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.
+
false
+ + +### KeplerInternal.spec.estimator +[↩ Parent](#keplerinternalspec) + + + +Estimator Spec + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
containerobject +
+
false
imagestring +
+
false
nodeobject +
+
false
+ + +### KeplerInternal.spec.estimator.container +[↩ Parent](#keplerinternalspecestimator) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
componentsobject +
+
false
totalobject +
+
false
+ + +### KeplerInternal.spec.estimator.container.components +[↩ Parent](#keplerinternalspecestimatorcontainer) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
initUrlstring +
+
false
selectorobject +
+
false
sidecarboolean +
+
+ Default: false
+
false
+ + +### KeplerInternal.spec.estimator.container.components.selector +[↩ Parent](#keplerinternalspecestimatorcontainercomponents) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
filterConditionsstring +
+
false
modelNamestring +
+
false
+ + +### KeplerInternal.spec.estimator.container.total +[↩ Parent](#keplerinternalspecestimatorcontainer) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
initUrlstring +
+
false
selectorobject +
+
false
sidecarboolean +
+
+ Default: false
+
false
+ + +### KeplerInternal.spec.estimator.container.total.selector +[↩ Parent](#keplerinternalspecestimatorcontainertotal) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
filterConditionsstring +
+
false
modelNamestring +
+
false
+ + +### KeplerInternal.spec.estimator.node +[↩ Parent](#keplerinternalspecestimator) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
componentsobject +
+
false
totalobject +
+
false
+ + +### KeplerInternal.spec.estimator.node.components +[↩ Parent](#keplerinternalspecestimatornode) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
initUrlstring +
+
false
selectorobject +
+
false
sidecarboolean +
+
+ Default: false
+
false
+ + +### KeplerInternal.spec.estimator.node.components.selector +[↩ Parent](#keplerinternalspecestimatornodecomponents) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
filterConditionsstring +
+
false
modelNamestring +
+
false
+ + +### KeplerInternal.spec.estimator.node.total +[↩ Parent](#keplerinternalspecestimatornode) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
initUrlstring +
+
false
selectorobject +
+
false
sidecarboolean +
+
+ Default: false
+
false
+ + +### KeplerInternal.spec.estimator.node.total.selector +[↩ Parent](#keplerinternalspecestimatornodetotal) + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
filterConditionsstring +
+
false
modelNamestring +
+
false
+ + +### KeplerInternal.spec.modelServer +[↩ Parent](#keplerinternalspec) + + + +Kepler Model Server Spec + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean +
+
+ Default: false
+
false
errKeystring +
+
+ Default:
+
false
imagestring +
+
false
listPathstring +
+
+ Default:
+
false
pathstring +
+
+ Default:
+
false
pipelineUrlstring +
+
+ Default:
+
false
portinteger +
+
+ Default: 8100
+ Minimum: 1
+ Maximum: 65535
+
false
requestPathstring +
+
+ Default:
+
false
storageobject +
+
false
urlstring +
+
+ Default:
+
false
+ + +### KeplerInternal.spec.modelServer.storage +[↩ Parent](#keplerinternalspecmodelserver) + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
persistentVolumeClaimobject + PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes
false
-### KeplerInternal.spec.exporter -[↩ Parent](#keplerinternalspec) - +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim +[↩ Parent](#keplerinternalspecmodelserverstorage) +PersistentVolumeClaimSpec describes the common attributes of storage devices and allows a Source for provider-specific attributes @@ -120,22 +834,71 @@ KeplerInternalSpec defines the desired state of KeplerInternal - + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
deploymentaccessModes[]string + accessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1
+
false
dataSource object -
+ dataSource field can be used to specify either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) * An existing PVC (PersistentVolumeClaim) If the provisioner or an external controller can support the specified data source, it will create a new volume based on the contents of the specified data source. When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. If the namespace is specified, then dataSourceRef will not be copied to dataSource.
truefalse
dataSourceRefobject + dataSourceRef specifies the object from which to populate the volume with data, if a non-empty volume is desired. This may be any object from a non-empty API group (non core object) or a PersistentVolumeClaim object. When this field is specified, volume binding will only succeed if the type of the specified object matches some installed volume populator or dynamic provisioner. This field will replace the functionality of the dataSource field and as such if both fields are non-empty, they must have the same value. For backwards compatibility, when namespace isn't specified in dataSourceRef, both fields (dataSource and dataSourceRef) will be set to the same value automatically if one of them is empty and the other is non-empty. When namespace is specified in dataSourceRef, dataSource isn't set to the same value and must be empty. There are three important differences between dataSource and dataSourceRef: * While dataSource only allows two specific types of objects, dataSourceRef allows any non-core object, as well as PersistentVolumeClaim objects. * While dataSource ignores disallowed values (dropping them), dataSourceRef preserves all values, and generates an error if a disallowed value is specified. * While dataSource only allows local objects, dataSourceRef allows objects in any namespaces. (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled.
+
false
resourcesobject + resources represents the minimum resources the volume should have. If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements that are lower than previous value but must still be higher than capacity recorded in the status field of the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources
+
false
selectorobject + selector is a label query over volumes to consider for binding.
+
false
storageClassNamestring + storageClassName is the name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1
+
false
volumeModestring + volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec.
+
false
volumeNamestring + volumeName is the binding reference to the PersistentVolume backing this claim.
+
false
-### KeplerInternal.spec.exporter.deployment -[↩ Parent](#keplerinternalspecexporter) - +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.dataSource +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaim) +dataSource field can be used to specify either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) * An existing PVC (PersistentVolumeClaim) If the provisioner or an external controller can support the specified data source, it will create a new volume based on the contents of the specified data source. When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. If the namespace is specified, then dataSourceRef will not be copied to dataSource. @@ -147,59 +910,84 @@ KeplerInternalSpec defines the desired state of KeplerInternal - + - + - - + + + +
imagekind string - Image of kepler-exporter to be deployed
+ Kind is the type of resource being referenced
true
namespacename string - Namespace where kepler-exporter will be deployed
+ Name is the name of resource being referenced
true
nodeSelectormap[string]stringapiGroupstring - Defines which Nodes the Pod is scheduled on
-
- Default: map[kubernetes.io/os:linux]
+ APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required.
false
+ + +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.dataSourceRef +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaim) + + + +dataSourceRef specifies the object from which to populate the volume with data, if a non-empty volume is desired. This may be any object from a non-empty API group (non core object) or a PersistentVolumeClaim object. When this field is specified, volume binding will only succeed if the type of the specified object matches some installed volume populator or dynamic provisioner. This field will replace the functionality of the dataSource field and as such if both fields are non-empty, they must have the same value. For backwards compatibility, when namespace isn't specified in dataSourceRef, both fields (dataSource and dataSourceRef) will be set to the same value automatically if one of them is empty and the other is non-empty. When namespace is specified in dataSourceRef, dataSource isn't set to the same value and must be empty. There are three important differences between dataSource and dataSourceRef: * While dataSource only allows two specific types of objects, dataSourceRef allows any non-core object, as well as PersistentVolumeClaim objects. * While dataSource ignores disallowed values (dropping them), dataSourceRef preserves all values, and generates an error if a disallowed value is specified. * While dataSource only allows local objects, dataSourceRef allows objects in any namespaces. (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + + + + + + + + + + + + + + + - - + + + + + + + - - + +
NameTypeDescriptionRequired
kindstring + Kind is the type of resource being referenced
+
true
portintegernamestring -
-
- Format: int32
- Default: 9103
- Minimum: 1
- Maximum: 65535
+ Name is the name of resource being referenced
+
true
apiGroupstring + APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required.
false
tolerations[]objectnamespacestring - If specified, define Pod's tolerations
-
- Default: [map[effect: key: operator:Exists value:]]
+ Namespace is the namespace of resource being referenced Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled.
false
-### KeplerInternal.spec.exporter.deployment.tolerations[index] -[↩ Parent](#keplerinternalspecexporterdeployment) +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.resources +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaim) -The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . +resources represents the minimum resources the volume should have. If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements that are lower than previous value but must still be higher than capacity recorded in the status field of the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources @@ -211,40 +999,128 @@ The pod this Toleration is attached to tolerates any taint that matches the trip - - + + - - + + - + + + + + +
effectstringclaims[]object - Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
+ Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container. + This is an alpha field and requires enabling the DynamicResourceAllocation feature gate. + This field is immutable. It can only be set for containers.
false
keystringlimitsmap[string]int or string - Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys.
+ Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
false
operatorrequestsmap[string]int or string + Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
+
false
+ + +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.resources.claims[index] +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaimresources) + + + +ResourceClaim references one entry in PodSpec.ResourceClaims. + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
name string - Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+ Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container.
+
true
+ + +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.selector +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaim) + + + +selector is a label query over volumes to consider for binding. + + + + + + + + + + + + + + - - + + + +
NameTypeDescriptionRequired
matchExpressions[]object + matchExpressions is a list of label selector requirements. The requirements are ANDed.
false
tolerationSecondsintegermatchLabelsmap[string]string - TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system.
-
- Format: int64
+ matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed.
false
+ + +### KeplerInternal.spec.modelServer.storage.persistentVolumeClaim.selector.matchExpressions[index] +[↩ Parent](#keplerinternalspecmodelserverstoragepersistentvolumeclaimselector) + + + +A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. + + + + + + + + + + + + + + + - + + + + + + @@ -333,12 +1209,53 @@ KeplerInternalStatus represents status of KeplerInternal + + + + + + + + + + + +
NameTypeDescriptionRequired
keystring + key is the label key that the selector applies to.
+
true
valueoperator string - Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string.
+ operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist.
+
true
values[]string + values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch.
false
estimatorobject +
+
false
exporter object ExporterStatus defines the observed state of Kepler Exporter
false
modelServerobject +
+
false
+ + +### KeplerInternal.status.estimator +[↩ Parent](#keplerinternalstatus) + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
statusstring +
+
false
@@ -499,6 +1416,33 @@ ExporterStatus defines the observed state of Kepler Exporter + +### KeplerInternal.status.modelServer +[↩ Parent](#keplerinternalstatus) + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
statusstring +
+
false
+ ## Kepler [↩ Parent](#keplersystemsustainablecomputingiov1alpha1 ) diff --git a/pkg/api/v1alpha1/kepler_internal_types.go b/pkg/api/v1alpha1/kepler_internal_types.go index 63b95859..94074533 100644 --- a/pkg/api/v1alpha1/kepler_internal_types.go +++ b/pkg/api/v1alpha1/kepler_internal_types.go @@ -17,6 +17,7 @@ limitations under the License. package v1alpha1 import ( + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -53,8 +54,81 @@ type OpenShiftSpec struct { // KeplerInternalSpec defines the desired state of KeplerInternal type KeplerInternalSpec struct { - Exporter InternalExporterSpec `json:"exporter"` - OpenShift OpenShiftSpec `json:"openshift,omitempty"` + Exporter InternalExporterSpec `json:"exporter"` + Estimator *InternalEstimatorSpec `json:"estimator,omitempty"` + ModelServer *InternalModelServerSpec `json:"modelServer,omitempty"` + OpenShift OpenShiftSpec `json:"openshift,omitempty"` +} + +// Kepler Model Server Spec +type InternalModelServerSpec struct { + + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + + Image string `json:"image,omitempty"` + + // +kubebuilder:default="" + URL string `json:"url,omitempty"` + + // +kubebuilder:default=8100 + // +kubebuilder:validation:Maximum=65535 + // +kubebuilder:validation:Minimum=1 + Port int `json:"port,omitempty"` + + // +kubebuilder:default="" + Path string `json:"path,omitempty"` + + // +kubebuilder:default="" + RequestPath string `json:"requestPath,omitempty"` + + // +kubebuilder:default="" + ListPath string `json:"listPath,omitempty"` + + // +kubebuilder:default="" + PipelineURL string `json:"pipelineUrl,omitempty"` + + // +kubebuilder:default="" + ErrorKey string `json:"errKey,omitempty"` + + Storage ModelServerStorageSpec `json:"storage,omitempty"` +} + +type ModelServerStorageSpec struct { + PersistentVolumeClaim *corev1.PersistentVolumeClaimSpec `json:"persistentVolumeClaim,omitempty"` +} + +// Estimator Spec +type InternalEstimatorSpec struct { 
+ Image string `json:"image,omitempty"` + Node EstimatorGroup `json:"node,omitempty"` + Container EstimatorGroup `json:"container,omitempty"` +} + +func (e InternalEstimatorSpec) Enabled() bool { + return e.Node.Enabled() || e.Container.Enabled() +} + +type EstimatorGroup struct { + Total *EstimatorConfig `json:"total,omitempty"` + Components *EstimatorConfig `json:"components,omitempty"` +} + +func (g EstimatorGroup) Enabled() bool { + return (g.Total != nil && g.Total.SidecarEnabled) || (g.Components != nil && g.Components.SidecarEnabled) +} + +type EstimatorConfig struct { + // +kubebuilder:default=false + SidecarEnabled bool `json:"sidecar,omitempty"` + + InitUrl string `json:"initUrl,omitempty"` + Selector *ModelSelectorSpec `json:"selector,omitempty"` +} + +type ModelSelectorSpec struct { + ModelName string `json:"modelName,omitempty"` + FilterConditions string `json:"filterConditions,omitempty"` } //+kubebuilder:object:root=true @@ -71,6 +145,8 @@ type KeplerInternalSpec struct { // +kubebuilder:printcolumn:name="Image",type=string,JSONPath=`.spec.exporter.deployment.image` // +kubebuilder:printcolumn:name="Node-Selector",type=string,JSONPath=`.spec.exporter.deployment.nodeSelector`,priority=10 // +kubebuilder:printcolumn:name="Tolerations",type=string,JSONPath=`.spec.exporter.deployment.tolerations`,priority=10 +// +kubebuilder:printcolumn:name="Estimator",type=string,JSONPath=`.status.estimator.status` +// +kubebuilder:printcolumn:name="Model-Server",type=string,JSONPath=`.status.modelServer.status` // // KeplerInternal is the Schema for the keplers internal API type KeplerInternal struct { @@ -81,9 +157,27 @@ type KeplerInternal struct { Status KeplerInternalStatus `json:"status,omitempty"` } +type DeploymentStatus string + +const ( + DeploymentNotInstalled DeploymentStatus = "NotInstalled" + DeploymentNotReady DeploymentStatus = "NotReady" + DeploymentRunning DeploymentStatus = "Running" +) + // KeplerInternalStatus represents status of KeplerInternal 
type KeplerInternalStatus struct { - Exporter ExporterStatus `json:"exporter,omitempty"` + Exporter ExporterStatus `json:"exporter,omitempty"` + Estimator EstimatorStatus `json:"estimator,omitempty"` + ModelServer ModelServerStatus `json:"modelServer,omitempty"` +} + +type EstimatorStatus struct { + Status DeploymentStatus `json:"status,omitempty"` +} + +type ModelServerStatus struct { + Status DeploymentStatus `json:"status,omitempty"` } func (ki KeplerInternal) Namespace() string { @@ -94,6 +188,10 @@ func (ki KeplerInternal) DaemonsetName() string { return ki.Name } +func (ki KeplerInternal) ModelServerDeploymentName() string { + return ki.Name + "-model-server" +} + func (ki KeplerInternal) ServiceAccountName() string { return ki.Name } diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index 8fe6e8de..6b650b2d 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -57,6 +57,66 @@ func (in *DashboardSpec) DeepCopy() *DashboardSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EstimatorConfig) DeepCopyInto(out *EstimatorConfig) { + *out = *in + if in.Selector != nil { + in, out := &in.Selector, &out.Selector + *out = new(ModelSelectorSpec) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EstimatorConfig. +func (in *EstimatorConfig) DeepCopy() *EstimatorConfig { + if in == nil { + return nil + } + out := new(EstimatorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *EstimatorGroup) DeepCopyInto(out *EstimatorGroup) { + *out = *in + if in.Total != nil { + in, out := &in.Total, &out.Total + *out = new(EstimatorConfig) + (*in).DeepCopyInto(*out) + } + if in.Components != nil { + in, out := &in.Components, &out.Components + *out = new(EstimatorConfig) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EstimatorGroup. +func (in *EstimatorGroup) DeepCopy() *EstimatorGroup { + if in == nil { + return nil + } + out := new(EstimatorGroup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EstimatorStatus) DeepCopyInto(out *EstimatorStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EstimatorStatus. +func (in *EstimatorStatus) DeepCopy() *EstimatorStatus { + if in == nil { + return nil + } + out := new(EstimatorStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ExporterDeploymentSpec) DeepCopyInto(out *ExporterDeploymentSpec) { *out = *in @@ -124,6 +184,23 @@ func (in *ExporterStatus) DeepCopy() *ExporterStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InternalEstimatorSpec) DeepCopyInto(out *InternalEstimatorSpec) { + *out = *in + in.Node.DeepCopyInto(&out.Node) + in.Container.DeepCopyInto(&out.Container) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InternalEstimatorSpec. 
+func (in *InternalEstimatorSpec) DeepCopy() *InternalEstimatorSpec { + if in == nil { + return nil + } + out := new(InternalEstimatorSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InternalExporterDeploymentSpec) DeepCopyInto(out *InternalExporterDeploymentSpec) { *out = *in @@ -156,6 +233,22 @@ func (in *InternalExporterSpec) DeepCopy() *InternalExporterSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InternalModelServerSpec) DeepCopyInto(out *InternalModelServerSpec) { + *out = *in + in.Storage.DeepCopyInto(&out.Storage) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InternalModelServerSpec. +func (in *InternalModelServerSpec) DeepCopy() *InternalModelServerSpec { + if in == nil { + return nil + } + out := new(InternalModelServerSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Kepler) DeepCopyInto(out *Kepler) { *out = *in @@ -246,6 +339,16 @@ func (in *KeplerInternalList) DeepCopyObject() runtime.Object { func (in *KeplerInternalSpec) DeepCopyInto(out *KeplerInternalSpec) { *out = *in in.Exporter.DeepCopyInto(&out.Exporter) + if in.Estimator != nil { + in, out := &in.Estimator, &out.Estimator + *out = new(InternalEstimatorSpec) + (*in).DeepCopyInto(*out) + } + if in.ModelServer != nil { + in, out := &in.ModelServer, &out.ModelServer + *out = new(InternalModelServerSpec) + (*in).DeepCopyInto(*out) + } out.OpenShift = in.OpenShift } @@ -263,6 +366,8 @@ func (in *KeplerInternalSpec) DeepCopy() *KeplerInternalSpec { func (in *KeplerInternalStatus) DeepCopyInto(out *KeplerInternalStatus) { *out = *in in.Exporter.DeepCopyInto(&out.Exporter) + out.Estimator = in.Estimator + out.ModelServer = in.ModelServer } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KeplerInternalStatus. @@ -339,6 +444,56 @@ func (in *KeplerStatus) DeepCopy() *KeplerStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelSelectorSpec) DeepCopyInto(out *ModelSelectorSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelSelectorSpec. +func (in *ModelSelectorSpec) DeepCopy() *ModelSelectorSpec { + if in == nil { + return nil + } + out := new(ModelSelectorSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelServerStatus) DeepCopyInto(out *ModelServerStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelServerStatus. 
+func (in *ModelServerStatus) DeepCopy() *ModelServerStatus { + if in == nil { + return nil + } + out := new(ModelServerStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelServerStorageSpec) DeepCopyInto(out *ModelServerStorageSpec) { + *out = *in + if in.PersistentVolumeClaim != nil { + in, out := &in.PersistentVolumeClaim, &out.PersistentVolumeClaim + *out = new(v1.PersistentVolumeClaimSpec) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelServerStorageSpec. +func (in *ModelServerStorageSpec) DeepCopy() *ModelServerStorageSpec { + if in == nil { + return nil + } + out := new(ModelServerStorageSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *OpenShiftSpec) DeepCopyInto(out *OpenShiftSpec) { *out = *in diff --git a/pkg/components/estimator/estimator.go b/pkg/components/estimator/estimator.go new file mode 100644 index 00000000..b03d1d9a --- /dev/null +++ b/pkg/components/estimator/estimator.go @@ -0,0 +1,139 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package estimator + +import ( + "fmt" + "strings" + + "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" + "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" + + corev1 "k8s.io/api/core/v1" +) + +const ( + StableImage = "quay.io/sustainable_computing_io/kepler_model_server:v0.6" + waitForSocketCommand = "until [ -e /tmp/estimator.sock ]; do sleep 1; done && %s" +) + +var ( + shellCommand = []string{"/bin/sh", "-c"} +) + +// NeedsEstimatorSidecar returns true if any of estimator config has sidecar enabled +func NeedsEstimatorSidecar(es *v1alpha1.InternalEstimatorSpec) bool { + if es == nil { + return false + } + if es.Node.Total != nil && es.Node.Total.SidecarEnabled { + return true + } + if es.Node.Components != nil && es.Node.Components.SidecarEnabled { + return true + } + if es.Container.Total != nil && es.Container.Total.SidecarEnabled { + return true + } + if es.Container.Components != nil && es.Container.Components.SidecarEnabled { + return true + } + + return false +} + +// NewContainer returns sidecar container +func NewContainer(image string) corev1.Container { + mounts := []corev1.VolumeMount{{ + Name: "cfm", + MountPath: "/etc/kepler/kepler.config", + ReadOnly: true, + }, { + Name: "mnt", + MountPath: "/mnt", + }, { + Name: "tmp", + MountPath: "/tmp", + }} + + return corev1.Container{ + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Name: "estimator", + VolumeMounts: mounts, + Command: []string{"python3.8"}, + Args: []string{"-u", "src/estimate/estimator.py"}, + } +} + +// NewVolumes returns sidecar additional volumes +func NewVolumes() []corev1.Volume { + return []corev1.Volume{ + k8s.VolumeFromEmptyDir("mnt"), + k8s.VolumeFromEmptyDir("tmp"), + } +} + +func addTmpMount(volumeMounts []corev1.VolumeMount) []corev1.VolumeMount { + return append(volumeMounts, corev1.VolumeMount{ + Name: "tmp", + MountPath: "/tmp", + }) +} + +func addSocketWaitCmd(exporterContainer *corev1.Container) *corev1.Container 
{
+	cmd := exporterContainer.Command
+	exporterContainer.Command = shellCommand
+	exporterContainer.Args = []string{fmt.Sprintf(waitForSocketCommand, strings.Join(cmd, " "))}
+	return exporterContainer
+}
+
+func AddEstimatorDependency(exporterContainer *corev1.Container) *corev1.Container {
+	exporterContainer = addSocketWaitCmd(exporterContainer)
+	exporterContainer.VolumeMounts = addTmpMount(exporterContainer.VolumeMounts)
+	return exporterContainer
+}
+
+func estimatorConfig(prefix string, spec v1alpha1.EstimatorConfig) string {
+	var builder strings.Builder
+
+	builder.WriteString(fmt.Sprintf("%s_ESTIMATOR=%v\n", prefix, spec.SidecarEnabled))
+	if spec.InitUrl != "" {
+		builder.WriteString(fmt.Sprintf("%s_INIT_URL=%s\n", prefix, spec.InitUrl))
+	}
+
+	return builder.String()
+}
+
+func ModelConfig(es *v1alpha1.InternalEstimatorSpec) string {
+	var builder strings.Builder
+
+	if es.Node.Total != nil {
+		builder.WriteString(estimatorConfig("NODE_TOTAL", *es.Node.Total))
+	}
+	if es.Node.Components != nil {
+		builder.WriteString(estimatorConfig("NODE_COMPONENTS", *es.Node.Components))
+	}
+	if es.Container.Total != nil {
+		builder.WriteString(estimatorConfig("CONTAINER_TOTAL", *es.Container.Total))
+	}
+	if es.Container.Components != nil {
+		builder.WriteString(estimatorConfig("CONTAINER_COMPONENTS", *es.Container.Components))
+	}
+
+	return builder.String()
+}
diff --git a/pkg/components/estimator/estimator_test.go b/pkg/components/estimator/estimator_test.go
new file mode 100644
index 00000000..a4ca4abf
--- /dev/null
+++ b/pkg/components/estimator/estimator_test.go
@@ -0,0 +1,157 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package estimator + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" + "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" + corev1 "k8s.io/api/core/v1" +) + +func sidecarEnabledSpec() *v1alpha1.EstimatorConfig { + return &v1alpha1.EstimatorConfig{ + InitUrl: "fake-url.zip", + SidecarEnabled: true, + } +} + +func sidecarDisabledSpec() *v1alpha1.EstimatorConfig { + return &v1alpha1.EstimatorConfig{ + InitUrl: "fake-url.json", + SidecarEnabled: false, + } +} + +func TestModelConfig(t *testing.T) { + + nodeTotalEnabled := "NODE_TOTAL_ESTIMATOR=true\nNODE_TOTAL_INIT_URL=fake-url.zip\n" + nodeComponentsEnabled := "NODE_COMPONENTS_ESTIMATOR=true\nNODE_COMPONENTS_INIT_URL=fake-url.zip\n" + + containerTotalEnabled := "CONTAINER_TOTAL_ESTIMATOR=true\nCONTAINER_TOTAL_INIT_URL=fake-url.zip\n" + containerComponentsEnabled := "CONTAINER_COMPONENTS_ESTIMATOR=true\nCONTAINER_COMPONENTS_INIT_URL=fake-url.zip\n" + + nodeTotalDisabled := "NODE_TOTAL_ESTIMATOR=false\nNODE_TOTAL_INIT_URL=fake-url.json\n" + nodeComponentsDisabled := "NODE_COMPONENTS_ESTIMATOR=false\nNODE_COMPONENTS_INIT_URL=fake-url.json\n" + + containerTotalDisabled := "CONTAINER_TOTAL_ESTIMATOR=false\nCONTAINER_TOTAL_INIT_URL=fake-url.json\n" + containerComponentsDisabled := "CONTAINER_COMPONENTS_ESTIMATOR=false\nCONTAINER_COMPONENTS_INIT_URL=fake-url.json\n" + + tt := []struct { + spec *v1alpha1.KeplerInternalSpec + configStr string + scenario string + }{ + { + spec: &v1alpha1.KeplerInternalSpec{ + Estimator: 
&v1alpha1.InternalEstimatorSpec{
+					Node: v1alpha1.EstimatorGroup{
+						Total:      sidecarDisabledSpec(),
+						Components: sidecarDisabledSpec(),
+					},
+					Container: v1alpha1.EstimatorGroup{
+						Total:      sidecarDisabledSpec(),
+						Components: sidecarDisabledSpec(),
+					},
+				},
+			},
+			configStr: nodeTotalDisabled + nodeComponentsDisabled + containerTotalDisabled + containerComponentsDisabled,
+			scenario:  "all disable case",
+		},
+		{
+			spec: &v1alpha1.KeplerInternalSpec{
+				Estimator: &v1alpha1.InternalEstimatorSpec{
+					Node: v1alpha1.EstimatorGroup{
+						Total:      sidecarEnabledSpec(),
+						Components: sidecarEnabledSpec(),
+					},
+					Container: v1alpha1.EstimatorGroup{
+						Total:      sidecarEnabledSpec(),
+						Components: sidecarEnabledSpec(),
+					},
+				},
+			},
+			configStr: nodeTotalEnabled + nodeComponentsEnabled + containerTotalEnabled + containerComponentsEnabled,
+			scenario:  "all enable case",
+		},
+	}
+
+	for _, tc := range tt {
+		tc := tc
+		t.Run(tc.scenario, func(t *testing.T) {
+			t.Parallel()
+			k := v1alpha1.KeplerInternal{
+				Spec: *tc.spec,
+			}
+			actual := ModelConfig(k.Spec.Estimator)
+			assert.Equal(t, actual, tc.configStr)
+		})
+	}
+}
+
+func TestModifiedContainer(t *testing.T) {
+	exporterContainer := &corev1.Container{
+		Command: []string{"kepler", "-v=1"},
+	}
+	expectedCommand := []string{"/bin/sh", "-c"}
+	expectedArgs := []string{"until [ -e /tmp/estimator.sock ]; do sleep 1; done && kepler -v=1"}
+	expectedVolumeMounts := []string{"cfm", "mnt", "tmp"}
+	keplerVolumes := []corev1.Volume{k8s.VolumeFromEmptyDir("kepler-volume")}
+	expectedVolumes := []string{"kepler-volume", "mnt", "tmp"}
+	t.Run("modified container", func(t *testing.T) {
+		t.Parallel()
+		k := v1alpha1.KeplerInternal{
+			Spec: v1alpha1.KeplerInternalSpec{
+				Estimator: &v1alpha1.InternalEstimatorSpec{
+					Image: StableImage,
+					Node: v1alpha1.EstimatorGroup{
+						Total:      sidecarEnabledSpec(),
+						Components: sidecarDisabledSpec(),
+					},
+				},
+			},
+		}
+		need := NeedsEstimatorSidecar(k.Spec.Estimator)
+		assert.Equal(t, need, true)
+		container := 
NewContainer(k.Spec.Estimator.Image) + assert.Equal(t, len(container.VolumeMounts), len(expectedVolumeMounts)) + for index, mnt := range container.VolumeMounts { + assert.Equal(t, mnt.Name, expectedVolumeMounts[index]) + } + volumes := append(keplerVolumes, NewVolumes()...) + assert.Equal(t, len(volumes), len(expectedVolumes)) + for index, volume := range volumes { + assert.Equal(t, volume.Name, expectedVolumes[index]) + } + exporterContainer := AddEstimatorDependency(exporterContainer) + actualCommand := exporterContainer.Command + actualArgs := exporterContainer.Args + assert.Equal(t, len(actualCommand), len(expectedCommand)) + assert.Equal(t, len(actualArgs), len(expectedArgs)) + for index, actual := range actualCommand { + assert.Equal(t, actual, expectedCommand[index]) + } + for index, actual := range actualArgs { + assert.Equal(t, actual, expectedArgs[index]) + } + exporterVolumeMounts := exporterContainer.VolumeMounts + assert.Equal(t, len(exporterVolumeMounts), 1) + assert.Equal(t, exporterVolumeMounts[0].Name, "tmp") + }) +} diff --git a/pkg/components/exporter/exporter.go b/pkg/components/exporter/exporter.go index ae88961f..b9487375 100644 --- a/pkg/components/exporter/exporter.go +++ b/pkg/components/exporter/exporter.go @@ -24,6 +24,8 @@ import ( "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/components" + "github.com/sustainable.computing.io/kepler-operator/pkg/components/estimator" + "github.com/sustainable.computing.io/kepler-operator/pkg/components/modelserver" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" secv1 "github.com/openshift/api/security/v1" @@ -74,16 +76,29 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.KeplerInternal) *appsv1. 
} deployment := k.Spec.Exporter.Deployment.ExporterDeploymentSpec - image := k.Spec.Exporter.Deployment.Image nodeSelector := deployment.NodeSelector tolerations := deployment.Tolerations - port := deployment.Port // NOTE: since 2 or more KeplerInternals can be deployed to the same namespace, // we need to make sure that the pod selector of each of the DaemonSet // create of each kepler is unique. Thus the daemonset name is added as // label to the pod - bindAddress := "0.0.0.0:" + strconv.Itoa(int(port)) + exporterContainer := newExporterContainer(k.Name, k.DaemonsetName(), k.Spec.Exporter.Deployment) + containers := []corev1.Container{exporterContainer} + + var volumes = []corev1.Volume{ + k8s.VolumeFromHost("lib-modules", "/lib/modules"), + k8s.VolumeFromHost("tracing", "/sys"), + k8s.VolumeFromHost("proc", "/proc"), + k8s.VolumeFromHost("kernel-src", "/usr/src/kernels"), + k8s.VolumeFromConfigMap("cfm", k.Name), + } // exporter default Volumes + + if estimator.NeedsEstimatorSidecar(k.Spec.Estimator) { + // add sidecar container and update kepler-exporter container + // add shared volumes + containers, volumes = addEstimatorSidecar(k.Spec.Estimator.Image, &exporterContainer, volumes) + } return &appsv1.DaemonSet{ TypeMeta: metav1.TypeMeta{ @@ -109,58 +124,8 @@ func NewDaemonSet(detail components.Detail, k *v1alpha1.KeplerInternal) *appsv1. 
ServiceAccountName: k.Name, DNSPolicy: corev1.DNSPolicy(corev1.DNSClusterFirstWithHostNet), Tolerations: tolerations, - Containers: []corev1.Container{{ - Name: k.DaemonsetName(), - SecurityContext: &corev1.SecurityContext{Privileged: pointer.Bool(true)}, - Image: image, - Command: []string{ - "/usr/bin/kepler", - "-address", bindAddress, - "-enable-cgroup-id=true", - "-enable-gpu=$(ENABLE_GPU)", - "-v=$(KEPLER_LOG_LEVEL)", - "-kernel-source-dir=/usr/share/kepler/kernel_sources", - "-redfish-cred-file-path=/etc/redfish/redfish.csv", - }, - Ports: []corev1.ContainerPort{{ - ContainerPort: int32(port), - Name: "http", - }}, - LivenessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/healthz", - Port: intstr.IntOrString{Type: intstr.Int, IntVal: port}, - Scheme: "HTTP", - }, - }, - FailureThreshold: 5, - InitialDelaySeconds: 10, - PeriodSeconds: 60, - SuccessThreshold: 1, - TimeoutSeconds: 10}, - Env: []corev1.EnvVar{ - {Name: "NODE_IP", ValueFrom: k8s.EnvFromField("status.hostIP")}, - {Name: "NODE_NAME", ValueFrom: k8s.EnvFromField("spec.nodeName")}, - {Name: "KEPLER_LOG_LEVEL", ValueFrom: k8s.EnvFromConfigMap("KEPLER_LOG_LEVEL", k.Name)}, - {Name: "ENABLE_GPU", ValueFrom: k8s.EnvFromConfigMap("ENABLE_GPU", k.Name)}}, - VolumeMounts: []corev1.VolumeMount{ - {Name: "lib-modules", MountPath: "/lib/modules", ReadOnly: true}, - {Name: "tracing", MountPath: "/sys", ReadOnly: true}, - {Name: "kernel-src", MountPath: "/usr/src/kernels", ReadOnly: true}, - {Name: "kernel-debug", MountPath: "/sys/kernel/debug"}, - {Name: "proc", MountPath: "/proc"}, - {Name: "cfm", MountPath: "/etc/kepler/kepler.config"}, - }, // VolumeMounts - }}, // Container: kepler / Containers - Volumes: []corev1.Volume{ - k8s.VolumeFromHost("lib-modules", "/lib/modules"), - k8s.VolumeFromHost("tracing", "/sys"), - k8s.VolumeFromHost("proc", "/proc"), - k8s.VolumeFromHost("kernel-src", "/usr/src/kernels"), - k8s.VolumeFromHost("kernel-debug", 
"/sys/kernel/debug"), - k8s.VolumeFromConfigMap("cfm", k.Name), - }, // Volumes + Containers: containers, + Volumes: volumes, }, // PodSpec }, // PodTemplateSpec }, // Spec @@ -249,6 +214,40 @@ func NewConfigMap(d components.Detail, k *v1alpha1.KeplerInternal) *corev1.Confi deployment := k.Spec.Exporter.Deployment.ExporterDeploymentSpec bindAddress := "0.0.0.0:" + strconv.Itoa(int(deployment.Port)) + modelConfig := "" + if k.Spec.Estimator != nil { + modelConfig = estimator.ModelConfig(k.Spec.Estimator) + } + + exporterConfigMap := k8s.StringMap{ + "KEPLER_NAMESPACE": k.Namespace(), + "KEPLER_LOG_LEVEL": "1", + "METRIC_PATH": "/metrics", + "BIND_ADDRESS": bindAddress, + "ENABLE_GPU": "true", + "ENABLE_QAT": "false", + "ENABLE_EBPF_CGROUPID": "true", + "EXPOSE_HW_COUNTER_METRICS": "true", + "EXPOSE_IRQ_COUNTER_METRICS": "true", + "EXPOSE_KUBELET_METRICS": "true", + "EXPOSE_CGROUP_METRICS": "true", + "ENABLE_PROCESS_METRICS": "false", + "CPU_ARCH_OVERRIDE": "", + "CGROUP_METRICS": "*", + "REDFISH_PROBE_INTERVAL_IN_SECONDS": "60", + "REDFISH_SKIP_SSL_VERIFY": "true", + "MODEL_CONFIG": modelConfig, + } + + ms := k.Spec.ModelServer + if ms != nil { + if ms.Enabled { + exporterConfigMap["MODEL_SERVER_ENABLE"] = "true" + } + modelServerConfig := modelserver.ConfigForClient(k.ModelServerDeploymentName(), k.Spec.ModelServer) + exporterConfigMap = exporterConfigMap.Merge(modelServerConfig) + } + return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), @@ -259,25 +258,7 @@ func NewConfigMap(d components.Detail, k *v1alpha1.KeplerInternal) *corev1.Confi Namespace: k.Namespace(), Labels: labels(k).ToMap(), }, - Data: map[string]string{ - "KEPLER_NAMESPACE": k.Namespace(), - "KEPLER_LOG_LEVEL": "1", - "METRIC_PATH": "/metrics", - "BIND_ADDRESS": bindAddress, - "ENABLE_GPU": "true", - "ENABLE_QAT": "false", - "ENABLE_EBPF_CGROUPID": "true", - "EXPOSE_HW_COUNTER_METRICS": "true", - "EXPOSE_IRQ_COUNTER_METRICS": "true", - 
"EXPOSE_KUBELET_METRICS": "true", - "EXPOSE_CGROUP_METRICS": "true", - "ENABLE_PROCESS_METRICS": "false", - "CPU_ARCH_OVERRIDE": "", - "CGROUP_METRICS": "*", - "REDFISH_PROBE_INTERVAL_IN_SECONDS": "60", - "REDFISH_SKIP_SSL_VERIFY": "true", - "MODEL_CONFIG": "CONTAINER_COMPONENTS_ESTIMATOR=false", - }, + Data: exporterConfigMap, } } @@ -600,3 +581,58 @@ func labels(ki *v1alpha1.KeplerInternal) k8s.StringMap { "app.kubernetes.io/part-of": ki.Name, }) } + +func newExporterContainer(kiName, dsName string, deployment v1alpha1.InternalExporterDeploymentSpec) corev1.Container { + bindAddress := "0.0.0.0:" + strconv.Itoa(int(deployment.Port)) + return corev1.Container{ + Name: dsName, + SecurityContext: &corev1.SecurityContext{Privileged: pointer.Bool(true)}, + Image: deployment.Image, + Command: []string{ + "/usr/bin/kepler", + "-address", bindAddress, + "-enable-cgroup-id=true", + "-enable-gpu=$(ENABLE_GPU)", + "-v=$(KEPLER_LOG_LEVEL)", + "-kernel-source-dir=/usr/share/kepler/kernel_sources", + "-redfish-cred-file-path=/etc/redfish/redfish.csv", + }, + Ports: []corev1.ContainerPort{{ + ContainerPort: int32(deployment.Port), + Name: "http", + }}, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.IntOrString{Type: intstr.Int, IntVal: deployment.Port}, + Scheme: "HTTP", + }, + }, + FailureThreshold: 5, + InitialDelaySeconds: 10, + PeriodSeconds: 60, + SuccessThreshold: 1, + TimeoutSeconds: 10}, + Env: []corev1.EnvVar{ + {Name: "NODE_IP", ValueFrom: k8s.EnvFromField("status.hostIP")}, + {Name: "NODE_NAME", ValueFrom: k8s.EnvFromField("spec.nodeName")}, + {Name: "KEPLER_LOG_LEVEL", ValueFrom: k8s.EnvFromConfigMap("KEPLER_LOG_LEVEL", kiName)}, + {Name: "ENABLE_GPU", ValueFrom: k8s.EnvFromConfigMap("ENABLE_GPU", kiName)}}, + VolumeMounts: []corev1.VolumeMount{ + {Name: "lib-modules", MountPath: "/lib/modules", ReadOnly: true}, + {Name: "tracing", MountPath: "/sys", ReadOnly: true}, + {Name: 
"kernel-src", MountPath: "/usr/src/kernels", ReadOnly: true}, + {Name: "proc", MountPath: "/proc"}, + {Name: "cfm", MountPath: "/etc/kepler/kepler.config"}, + }, + } +} + +func addEstimatorSidecar(estimatorImage string, exporterContainer *corev1.Container, volumes []corev1.Volume) ([]corev1.Container, []corev1.Volume) { + sidecarContainer := estimator.NewContainer(estimatorImage) + volumes = append(volumes, estimator.NewVolumes()...) + exporterContainer = estimator.AddEstimatorDependency(exporterContainer) + containers := []corev1.Container{*exporterContainer, sidecarContainer} + return containers, volumes +} diff --git a/pkg/components/exporter/exporter_test.go b/pkg/components/exporter/exporter_test.go index e008c3c9..f9425683 100644 --- a/pkg/components/exporter/exporter_test.go +++ b/pkg/components/exporter/exporter_test.go @@ -132,7 +132,6 @@ func TestVolumeMounts(t *testing.T) { {Name: "lib-modules", MountPath: "/lib/modules", ReadOnly: true}, {Name: "tracing", MountPath: "/sys", ReadOnly: true}, {Name: "kernel-src", MountPath: "/usr/src/kernels", ReadOnly: true}, - {Name: "kernel-debug", MountPath: "/sys/kernel/debug"}, {Name: "proc", MountPath: "/proc"}, {Name: "cfm", MountPath: "/etc/kepler/kepler.config"}, }, @@ -167,7 +166,6 @@ func TestVolumes(t *testing.T) { k8s.VolumeFromHost("tracing", "/sys"), k8s.VolumeFromHost("proc", "/proc"), k8s.VolumeFromHost("kernel-src", "/usr/src/kernels"), - k8s.VolumeFromHost("kernel-debug", "/sys/kernel/debug"), k8s.VolumeFromConfigMap("cfm", "kepler-internal"), }, scenario: "default case", diff --git a/pkg/components/modelserver/modelserver.go b/pkg/components/modelserver/modelserver.go index ac0f68a2..6da904d4 100644 --- a/pkg/components/modelserver/modelserver.go +++ b/pkg/components/modelserver/modelserver.go @@ -1,8 +1,3 @@ -//go:build ignore -// +build ignore - -// TODO: remove the tag above when model-server is added to Kepler Spec - /* Copyright 2023. 
@@ -23,8 +18,6 @@ package modelserver import ( "fmt" - "strconv" - "strings" "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/components" @@ -32,34 +25,23 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - resource "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) const ( - prefix = "model-server-" - // exported - PVCName = prefix + "pvc" - ConfigmapName = prefix + "cm" - ServiceName = prefix + "svc" - DeploymentName = prefix + "deploy" - ServiceAccountName = prefix + "sa" + PVCNameSuffix = "-pvc" + ConfigMapSuffix = "-cm" + ServiceSuffix = "-svc" ) const ( - defaultModelURL = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/main/tests/test_models" - defaultModels = ` - AbsComponentModelWeight=KerasCompWeightFullPipeline - AbsComponentPower=KerasCompFullPipeline - DynComponentPower=ScikitMixed - ` - - defaultPromServer = "http://prometheus-k8s." + components.Namespace + ".svc.cluster.local:9090/" - defaultModelServer = "http://kepler-model-server." 
+ components.Namespace + ".cluster.local:%d" - - image = "quay.io/sustainable_computing_io/kepler_model_server:latest" + defaultPromInterval = 20 + defaultPromStep = 3 + defaultPromServer = "http://prometheus-k8s.openshift-kepler-operator.svc.cluster.local:9090/" + defaultModelServer = "http://%s.openshift-kepler-operator.svc.cluster.local:%d" + StableImage = "quay.io/sustainable_computing_io/kepler_model_server:v0.6" ) var ( @@ -74,25 +56,18 @@ var ( }) ) -func NewServiceAccount() *corev1.ServiceAccount { - return &corev1.ServiceAccount{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ServiceAccount", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: ServiceAccountName, - Namespace: components.Namespace, - Labels: labels, - }, +func NewDeployment(deployName string, ms *v1alpha1.InternalModelServerSpec, namespace string) *appsv1.Deployment { + pvcName := deployName + PVCNameSuffix + configMapName := deployName + ConfigMapSuffix + var storage corev1.Volume + if ms.Storage.PersistentVolumeClaim == nil { + storage = k8s.VolumeFromEmptyDir("mnt") + } else { + storage = k8s.VolumeFromPVC("mnt", pvcName) } -} - -func NewDeployment(k *v1alpha1.Kepler) *appsv1.Deployment { - volumes := []corev1.Volume{ - k8s.VolumeFromPVC("mnt", PVCName), - k8s.VolumeFromConfigMap("cfm", ConfigmapName), + storage, + k8s.VolumeFromConfigMap("cfm", configMapName), } mounts := []corev1.VolumeMount{{ @@ -104,36 +79,28 @@ func NewDeployment(k *v1alpha1.Kepler) *appsv1.Deployment { MountPath: "/mnt", }} - // exporter will always be active - exporterPort := int32(k.Spec.Exporter.Port) - ms := k.Spec.ModelServer - + port := ms.Port containers := []corev1.Container{{ - Image: image, - ImagePullPolicy: corev1.PullAlways, - Name: "model-server-api", + Image: ms.Image, + ImagePullPolicy: corev1.PullIfNotPresent, + Name: "server-api", Ports: []corev1.ContainerPort{{ - ContainerPort: exporterPort, + ContainerPort: int32(port), Name: "http", }}, VolumeMounts: mounts, - 
Command: []string{"python3.8", "model_server.py"}, + Command: []string{"python3.8"}, + Args: []string{"-u", "src/server/model_server.py"}, }} - if ms.Trainer != nil { - containers = append(containers, corev1.Container{ - Image: image, - ImagePullPolicy: corev1.PullIfNotPresent, - Name: "online-trainer", - VolumeMounts: mounts, - Command: []string{"python3.8", "online_trainer.py"}, - }) - } - return &appsv1.Deployment{ + TypeMeta: metav1.TypeMeta{ + APIVersion: appsv1.SchemeGroupVersion.String(), + Kind: "Deployment", + }, ObjectMeta: metav1.ObjectMeta{ - Name: DeploymentName, - Namespace: components.Namespace, + Name: deployName, + Namespace: namespace, Labels: labels, }, @@ -154,13 +121,17 @@ func NewDeployment(k *v1alpha1.Kepler) *appsv1.Deployment { } } -func NewService(k *v1alpha1.Kepler) *corev1.Service { - exporterPort := int32(k.Spec.Exporter.Port) - +func NewService(deployName string, ms *v1alpha1.InternalModelServerSpec, namespace string) *corev1.Service { + port := ms.Port + serviceName := deployName + ServiceSuffix return &corev1.Service{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "Service", + }, ObjectMeta: metav1.ObjectMeta{ - Name: ServiceName, - Namespace: components.Namespace, + Name: serviceName, + Namespace: namespace, Labels: labels, }, Spec: corev1.ServiceSpec{ @@ -168,14 +139,25 @@ func NewService(k *v1alpha1.Kepler) *corev1.Service { Selector: labels, Ports: []corev1.ServicePort{{ Name: "http", - Port: exporterPort, + Port: int32(port), TargetPort: intstr.FromString("http"), }}, }, } } -func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { +func ConfigForClient(deployName string, ms *v1alpha1.InternalModelServerSpec) k8s.StringMap { + msConfig := k8s.StringMap{ + "MODEL_SERVER_URL": defaultIfEmpty(ms.URL, serverUrl(deployName, *ms)), + } + msConfig = msConfig.AddIfNotEmpty("MODEL_SERVER_REQ_PATH", ms.RequestPath) + msConfig = 
msConfig.AddIfNotEmpty("MODEL_SERVER_MODEL_LIST_PATH", ms.ListPath) + + return msConfig +} + +func NewConfigMap(deployName string, d components.Detail, ms *v1alpha1.InternalModelServerSpec, namespace string) *corev1.ConfigMap { + configMapName := deployName + ConfigMapSuffix if d == components.Metadata { return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -183,28 +165,19 @@ func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: ConfigmapName, - Namespace: components.Namespace, + Name: configMapName, + Namespace: namespace, Labels: labels, }, } } - - ms := k.Spec.ModelServer - msConfig := k8s.StringMap{ - "MODEL_SERVER_ENABLE": "true", - "PROM_SERVER": defaultIfEmpty(ms.PromServer, defaultPromServer), - - "MODEL_SERVER_URL": defaultIfEmpty(ms.URL, fmt.Sprintf(defaultModelServer, ms.Port)), - "MODEL_PATH": defaultIfEmpty(ms.Path, "models"), - "MODEL_SERVER_PORT": strconv.Itoa(ms.Port), - "MODEL_SERVER_REQ_PATH": defaultIfEmpty(ms.RequiredPath, "/model"), - - "MNT_PATH": "/mnt", + "MODEL_PATH": defaultIfEmpty(ms.Path, "/mnt/models"), } - - trainerSettings := settingsForTrainer(ms.Trainer) + msConfig = msConfig.AddIfNotEmpty("MODEL_SERVER_REQ_PATH", ms.RequestPath) + msConfig = msConfig.AddIfNotEmpty("MODEL_SERVER_MODEL_LIST_PATH", ms.ListPath) + msConfig = msConfig.AddIfNotEmpty("INITIAL_PIPELINE_URL", ms.PipelineURL) + msConfig = msConfig.AddIfNotEmpty("ERROR_KEY", ms.ErrorKey) return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -212,59 +185,27 @@ func NewConfigMap(d components.Detail, k *v1alpha1.Kepler) *corev1.ConfigMap { Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: ConfigmapName, - Namespace: components.Namespace, + Name: configMapName, + Namespace: namespace, Labels: labels, }, - Data: msConfig.Merge(trainerSettings).ToMap(), - } -} - -func settingsForTrainer(trainer *v1alpha1.ModelServerTrainerSpec) k8s.StringMap { - if trainer == nil { - return nil - } 
- - // iterate through available headers and append to promHeaders (this string will be converted to dict in the image) - promHeaders := strings.Builder{} - for _, h := range trainer.PromHeaders { - promHeaders.WriteString(h.Key) - promHeaders.WriteString(":") - promHeaders.WriteString(h.Value) - promHeaders.WriteString(",") - } - - return k8s.StringMap{ - "PROM_SSL_DISABLE": strconv.FormatBool(trainer.PromSSLDisable), - "PROM_HEADERS": promHeaders.String(), - "PROM_QUERY_INTERVAL": strconv.Itoa(trainer.PromQueryInterval), - "PROM_QUERY_STEP": strconv.Itoa(trainer.PromQueryStep), - "INITIAL_MODELS_LOC": defaultIfEmpty(trainer.InitialModelsEndpoint, defaultModelURL), - "INITIAL_MODEL_NAMES": defaultIfEmpty(trainer.InitialModelNames, defaultModels), + Data: msConfig.ToMap(), } } -func NewPVC(k *v1alpha1.Kepler) *corev1.PersistentVolumeClaim { +func NewPVC(deployName string, namespace string, pvcSpec *corev1.PersistentVolumeClaimSpec) *corev1.PersistentVolumeClaim { + pvcName := deployName + PVCNameSuffix return &corev1.PersistentVolumeClaim{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "PersistentVolumeClaim", }, ObjectMeta: metav1.ObjectMeta{ - Name: PVCName, - Namespace: components.Namespace, + Name: pvcName, + Namespace: namespace, Labels: labels, }, - Spec: corev1.PersistentVolumeClaimSpec{ - AccessModes: []corev1.PersistentVolumeAccessMode{ - corev1.ReadWriteOnce, - }, - Resources: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceName(corev1.ResourceStorage): resource.MustParse("5Gi"), - }, - }, - }, + Spec: *pvcSpec, } } @@ -274,3 +215,9 @@ func defaultIfEmpty(given, defaultVal string) string { } return defaultVal } + +func serverUrl(deployName string, ms v1alpha1.InternalModelServerSpec) string { + port := ms.Port + serviceName := deployName + ServiceSuffix + return fmt.Sprintf(defaultModelServer, serviceName, port) +} diff --git a/pkg/components/modelserver/modelserver_test.go 
b/pkg/components/modelserver/modelserver_test.go new file mode 100644 index 00000000..b276a499 --- /dev/null +++ b/pkg/components/modelserver/modelserver_test.go @@ -0,0 +1,144 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package modelserver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" + "github.com/sustainable.computing.io/kepler-operator/pkg/components" +) + +func TestConfigMap(t *testing.T) { + + tt := []struct { + spec *v1alpha1.InternalModelServerSpec + data map[string]string + scenario string + }{ + { + spec: &v1alpha1.InternalModelServerSpec{}, + data: map[string]string{ + "MODEL_PATH": "/mnt/models", + }, + scenario: "default case", + }, + { + spec: &v1alpha1.InternalModelServerSpec{ + URL: "fake-url", + Path: "fake-model-path", + RequestPath: "fake-request-path", + ListPath: "fake-model-list-path", + PipelineURL: "fake-pipeline", + ErrorKey: "fake-error-key", + }, + data: map[string]string{ + "MODEL_PATH": "fake-model-path", + "MODEL_SERVER_REQ_PATH": "fake-request-path", + "MODEL_SERVER_MODEL_LIST_PATH": "fake-model-list-path", + "INITIAL_PIPELINE_URL": "fake-pipeline", + "ERROR_KEY": "fake-error-key", + }, + scenario: "user defined server-api config", + }, + } + + for _, tc := range tt { + tc := tc + t.Run(tc.scenario, func(t *testing.T) { + t.Parallel() + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ + ModelServer: tc.spec, + }, + 
} + actual := NewConfigMap(k.ModelServerDeploymentName(), components.Full, k.Spec.ModelServer, k.Spec.Exporter.Deployment.Namespace) + assert.Equal(t, len(tc.data), len(actual.Data)) + for k, v := range tc.data { + assert.Equal(t, v, actual.Data[k]) + } + }) + } + +} + +func TestService(t *testing.T) { + + tt := []struct { + spec v1alpha1.InternalModelServerSpec + servicePort int32 + scenario string + }{ + { + spec: v1alpha1.InternalModelServerSpec{ + Port: 8080, + }, + servicePort: 8080, + scenario: "user defined port", + }, + } + + for _, tc := range tt { + tc := tc + t.Run(tc.scenario, func(t *testing.T) { + t.Parallel() + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ + ModelServer: &tc.spec, + }, + } + actual := NewService(k.ModelServerDeploymentName(), k.Spec.ModelServer, k.Spec.Exporter.Deployment.Namespace) + assert.Equal(t, actual.Spec.Ports[0].Port, tc.servicePort) + }) + } + +} + +func TestServerAPIContainer(t *testing.T) { + + tt := []struct { + spec v1alpha1.InternalModelServerSpec + servicePort int32 + scenario string + }{ + { + spec: v1alpha1.InternalModelServerSpec{ + Port: 8080, + }, + servicePort: 8080, + scenario: "user defined port", + }, + } + + for _, tc := range tt { + tc := tc + t.Run(tc.scenario, func(t *testing.T) { + t.Parallel() + k := v1alpha1.KeplerInternal{ + Spec: v1alpha1.KeplerInternalSpec{ + ModelServer: &tc.spec, + }, + } + actual := NewDeployment(k.ModelServerDeploymentName(), k.Spec.ModelServer, k.Spec.Exporter.Deployment.Namespace) + containers := actual.Spec.Template.Spec.Containers + assert.Equal(t, len(containers), 1) + assert.Equal(t, containers[0].Ports[0].ContainerPort, tc.servicePort) + }) + } + +} diff --git a/pkg/controllers/config.go b/pkg/controllers/config.go index 6c4aa32c..f0362098 100644 --- a/pkg/controllers/config.go +++ b/pkg/controllers/config.go @@ -29,4 +29,12 @@ var ( ImageLibbpf: "", Cluster: k8s.Kubernetes, } + + InternalConfig = struct { + ModelServerImage string + EstimatorImage 
string + }{ + ModelServerImage: "", + EstimatorImage: "", + } ) diff --git a/pkg/controllers/kepler_internal.go b/pkg/controllers/kepler_internal.go index fb09b0a4..3fcfa6aa 100644 --- a/pkg/controllers/kepler_internal.go +++ b/pkg/controllers/kepler_internal.go @@ -14,6 +14,7 @@ import ( "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/components" "github.com/sustainable.computing.io/kepler-operator/pkg/components/exporter" + "github.com/sustainable.computing.io/kepler-operator/pkg/components/modelserver" "github.com/sustainable.computing.io/kepler-operator/pkg/reconciler" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" @@ -41,11 +42,11 @@ type KeplerInternalReconciler struct { // common to all components deployed by operator //+kubebuilder:rbac:groups=core,resources=namespaces,verbs=list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=core,resources=services;configmaps;serviceaccounts,verbs=list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=core,resources=services;configmaps;serviceaccounts;persistentvolumeclaims,verbs=list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=*,verbs=* // RBAC for running Kepler exporter -//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=apps,resources=daemonsets;deployments,verbs=list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=list;watch;create;update;patch;delete;use //+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors;prometheusrules,verbs=list;watch;create;update;patch;delete @@ -219,6 +220,38 @@ func (r KeplerInternalReconciler) updateAvailableStatus(ctx context.Context, ki c.ObservedGeneration = ki.Generation } ki.Status.Exporter.Conditions = append(ki.Status.Exporter.Conditions, 
c) + + // update estimator status + estimatorStatus := &v1alpha1.EstimatorStatus{ + Status: v1alpha1.DeploymentNotInstalled, + } + if ki.Spec.Estimator != nil && len(dset.Spec.Template.Spec.Containers) > 1 { + // estimator enabled and has a sidecar container + if ds.NumberReady == ds.DesiredNumberScheduled { + estimatorStatus.Status = v1alpha1.DeploymentRunning + } else { + estimatorStatus.Status = v1alpha1.DeploymentNotReady + } + } + ki.Status.Estimator = *estimatorStatus + + // update model server status + modelServerStatus := &v1alpha1.ModelServerStatus{ + Status: v1alpha1.DeploymentNotInstalled, + } + if ki.Spec.ModelServer != nil { + key = types.NamespacedName{Name: ki.ModelServerDeploymentName(), Namespace: ki.Namespace()} + deploy := appsv1.Deployment{} + if err := r.Client.Get(ctx, key, &deploy); err == nil { + if deploy.Status.ReadyReplicas > 0 { + modelServerStatus.Status = v1alpha1.DeploymentRunning + } else { + modelServerStatus.Status = v1alpha1.DeploymentNotReady + } + } + } + ki.Status.ModelServer = *modelServerStatus + } func availableConditionForGetError(err error) v1alpha1.Condition { @@ -326,10 +359,25 @@ func (r KeplerInternalReconciler) reconcilersForInternal(k *v1alpha1.KeplerInter }) } + if k.Spec.Estimator != nil { + if k.Spec.Estimator.Image == "" { + k.Spec.Estimator.Image = InternalConfig.EstimatorImage + } + } + rs = append(rs, exporterReconcilers(k, Config.Cluster)...) - // TODO: add this when modelServer is supported by Kepler Spec - // rs = append(rs, modelServerReconcilers(k)...) + if k.Spec.ModelServer != nil && k.Spec.ModelServer.Enabled { + if k.Spec.ModelServer.Image == "" { + k.Spec.ModelServer.Image = InternalConfig.ModelServerImage + } + reconcilers, err := modelServerInternalReconcilers(k) + if err != nil { + r.logger.Info(fmt.Sprintf("cannot init model server reconciler from config: %v", err)) + } else { + rs = append(rs, reconcilers...) 
+ } + } if cleanup { rs = append(rs, reconciler.Deleter{ @@ -398,3 +446,32 @@ func openshiftResources(ki *v1alpha1.KeplerInternal, cluster k8s.Cluster) []clie } return res } + +func modelServerInternalReconcilers(ki *v1alpha1.KeplerInternal) ([]reconciler.Reconciler, error) { + ms := ki.Spec.ModelServer + msName := ki.ModelServerDeploymentName() + namespace := ki.Spec.Exporter.Deployment.Namespace + cm := modelserver.NewConfigMap(msName, components.Full, ms, namespace) + deploy := modelserver.NewDeployment(msName, ms, namespace) + svc := modelserver.NewService(msName, ms, namespace) + + resources := []client.Object{cm, deploy, svc} + + if ms.Storage.PersistentVolumeClaim != nil { + pvc := modelserver.NewPVC(msName, namespace, ms.Storage.PersistentVolumeClaim) + resources = append(resources, pvc) + } + + rs := updatersForInternalResources(ki, resources...) + return rs, nil +} + +func updatersForInternalResources(ki *v1alpha1.KeplerInternal, resources ...client.Object) []reconciler.Reconciler { + rs := []reconciler.Reconciler{} + resourceUpdater := newUpdaterWithOwner(ki) + for _, res := range resources { + rs = append(rs, resourceUpdater(res)) + } + return rs + +} diff --git a/pkg/reconciler/runner.go b/pkg/reconciler/runner.go index 9f15fe3e..f2435a6a 100644 --- a/pkg/reconciler/runner.go +++ b/pkg/reconciler/runner.go @@ -33,6 +33,7 @@ type Runner struct { Logger logr.Logger } +// TODO: make sure that model server container (deployment) is ready before creating kepler daemonset func (runner Runner) Run(ctx context.Context) (ctrl.Result, error) { var err error diff --git a/pkg/utils/k8s/k8s.go b/pkg/utils/k8s/k8s.go index 822ed406..56f68771 100644 --- a/pkg/utils/k8s/k8s.go +++ b/pkg/utils/k8s/k8s.go @@ -61,6 +61,13 @@ func (l StringMap) ToMap() map[string]string { return l } +func (l StringMap) AddIfNotEmpty(k, v string) StringMap { + if k != "" && v != "" { + l[k] = v + } + return l +} + func VolumeFromHost(name, path string) corev1.Volume { return corev1.Volume{ 
Name: name, @@ -93,6 +100,15 @@ func VolumeFromPVC(name, pvcName string) corev1.Volume { } } +func VolumeFromEmptyDir(name string) corev1.Volume { + return corev1.Volume{ + Name: name, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + } +} + func EnvFromField(path string) *corev1.EnvVarSource { return &corev1.EnvVarSource{ FieldRef: &corev1.ObjectFieldSelector{FieldPath: path}, diff --git a/pkg/utils/test/assertions.go b/pkg/utils/test/assertions.go index 83d42ca5..ab627adf 100644 --- a/pkg/utils/test/assertions.go +++ b/pkg/utils/test/assertions.go @@ -21,9 +21,11 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" "sigs.k8s.io/controller-runtime/pkg/client" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" @@ -108,3 +110,22 @@ func (f Framework) AssertNoResourceExists(name, ns string, obj client.Object, fn f.T.Errorf("%s (%v) exists after %v", k8s.GVKName(obj), key, opt.WaitTimeout) } } + +func (f Framework) AssertInternalStatus(name string) { + // the status will be updated + ki := f.WaitUntilInternalCondition(name, v1alpha1.Reconciled, v1alpha1.ConditionTrue) + assert.Equal(f.T, []corev1.Toleration{{Operator: "Exists"}}, ki.Spec.Exporter.Deployment.Tolerations) + + reconciled, err := k8s.FindCondition(ki.Status.Exporter.Conditions, v1alpha1.Reconciled) + assert.NoError(f.T, err, "unable to get reconciled condition") + assert.Equal(f.T, reconciled.ObservedGeneration, ki.Generation) + assert.Equal(f.T, reconciled.Status, v1alpha1.ConditionTrue) + // + ki = f.WaitUntilInternalCondition(ki.Name, v1alpha1.Available, v1alpha1.ConditionTrue) + available, err := k8s.FindCondition(ki.Status.Exporter.Conditions, v1alpha1.Available) + assert.NoError(f.T, err, "unable to get available condition") + 
assert.Equal(f.T, available.ObservedGeneration, ki.Generation) + assert.Equal(f.T, available.Status, v1alpha1.ConditionTrue) + + f.WaitUntilInternalHasExpectedRunning(ki.Name) +} diff --git a/pkg/utils/test/framework.go b/pkg/utils/test/framework.go index 2c711771..1b6046e8 100644 --- a/pkg/utils/test/framework.go +++ b/pkg/utils/test/framework.go @@ -250,6 +250,37 @@ func (f Framework) WaitUntilInternalCondition(name string, t v1alpha1.ConditionT return &k } +func (f Framework) WaitUntilInternalHasExpectedRunning(name string) *v1alpha1.KeplerInternal { + f.T.Helper() + k := v1alpha1.KeplerInternal{} + f.WaitUntil(fmt.Sprintf("kepler-internal %s has expected running status", name), + func() (bool, error) { + err := f.client.Get(context.TODO(), client.ObjectKey{Name: name}, &k) + if errors.IsNotFound(err) { + return true, fmt.Errorf("kepler-internal %s is not found", name) + } + statusOK := true + if k.Spec.Estimator != nil && k.Spec.Estimator.Enabled() { + if k.Status.Estimator.Status == v1alpha1.DeploymentNotInstalled { + return true, fmt.Errorf("kepler-internal %s should install estimator", name) + } + statusOK = k.Status.Estimator.Status == v1alpha1.DeploymentRunning + } else { + statusOK = k.Status.Estimator.Status == v1alpha1.DeploymentNotInstalled + } + if k.Spec.ModelServer != nil && k.Spec.ModelServer.Enabled { + if k.Status.ModelServer.Status == v1alpha1.DeploymentNotInstalled { + return true, fmt.Errorf("kepler-internal %s should install model-server", name) + } + statusOK = statusOK && k.Status.ModelServer.Status == v1alpha1.DeploymentRunning + } else { + statusOK = statusOK && k.Status.ModelServer.Status == v1alpha1.DeploymentNotInstalled + } + return statusOK, nil + }) + return &k +} + func (f Framework) WaitUntilKeplerCondition(name string, t v1alpha1.ConditionType, s v1alpha1.ConditionStatus) *v1alpha1.Kepler { f.T.Helper() k := v1alpha1.Kepler{} diff --git a/pkg/utils/test/kepler_internal_builder.go b/pkg/utils/test/kepler_internal_builder.go index 
d4be6e1e..9f8bef7b 100644 --- a/pkg/utils/test/kepler_internal_builder.go +++ b/pkg/utils/test/kepler_internal_builder.go @@ -49,3 +49,26 @@ func (b InternalBuilder) WithExporterPort(p int) func(k *v1alpha1.KeplerInternal k.Spec.Exporter.Deployment.Port = int32(p) } } + +func (b InternalBuilder) WithEstimator() func(k *v1alpha1.KeplerInternal) { + return func(k *v1alpha1.KeplerInternal) { + k.Spec.Estimator = &v1alpha1.InternalEstimatorSpec{ + Node: v1alpha1.EstimatorGroup{ + Total: &v1alpha1.EstimatorConfig{ + SidecarEnabled: true, + }, + Components: &v1alpha1.EstimatorConfig{ + SidecarEnabled: true, + }, + }, + } + } +} + +func (b InternalBuilder) WithModelServer() func(k *v1alpha1.KeplerInternal) { + return func(k *v1alpha1.KeplerInternal) { + k.Spec.ModelServer = &v1alpha1.InternalModelServerSpec{ + Enabled: true, + } + } +} diff --git a/tests/e2e/kepler_internal_test.go b/tests/e2e/kepler_internal_test.go index 7d6f3370..52a86c9b 100644 --- a/tests/e2e/kepler_internal_test.go +++ b/tests/e2e/kepler_internal_test.go @@ -21,7 +21,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/sustainable.computing.io/kepler-operator/pkg/api/v1alpha1" "github.com/sustainable.computing.io/kepler-operator/pkg/controllers" - "github.com/sustainable.computing.io/kepler-operator/pkg/utils/k8s" "github.com/sustainable.computing.io/kepler-operator/pkg/utils/test" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -54,19 +53,114 @@ func TestKeplerInternal_Reconciliation(t *testing.T) { assert.Equal(t, 1, len(containers)) assert.Equal(t, 1, len(containers[0].Ports)) assert.EqualValues(t, 9108, containers[0].Ports[0].ContainerPort) + // test expected status + f.AssertInternalStatus(ki.Name) +} + +func TestKeplerInternal_WithEstimator(t *testing.T) { + f := test.NewFramework(t) + name := "e2e-kepler-internal-with-estimator" + // Ensure Kepler is not deployed (by any chance) + f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait()) + + // test 
namespace must be the deployment namespace for controller
+	// to watch the deployments / daemonsets etc
+	testNs := controllers.KeplerDeploymentNS
+
+	// pre-condition
+	f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait())
+	// when
+	b := test.InternalBuilder{}
+	ki := f.CreateInternal(name,
+		b.WithNamespace(testNs),
+		b.WithExporterLibBpfImage(),
+		b.WithEstimator(),
+	)
+
+	// then the following resources will be created
+	f.AssertResourceExists(testNs, "", &corev1.Namespace{})
+
+	ds := appsv1.DaemonSet{}
+	f.AssertResourceExists(ki.Name, testNs, &ds)
+	containers := ds.Spec.Template.Spec.Containers
+	// daemonset must have a sidecar
+	assert.Equal(t, 2, len(containers))
+	// test expected status
+	f.AssertInternalStatus(ki.Name)
+}
+
+func TestKeplerInternal_WithModelServer(t *testing.T) {
+	f := test.NewFramework(t)
+	name := "e2e-kepler-internal-with-modelserver"
+	// Ensure Kepler is not deployed (by any chance)
+	f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait())
+
+	// test namespace must be the deployment namespace for controller
+	// to watch the deployments / daemonsets etc
+	testNs := controllers.KeplerDeploymentNS
+
+	// pre-condition
+	f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait())
+	// when
+	b := test.InternalBuilder{}
+	ki := f.CreateInternal(name,
+		b.WithNamespace(testNs),
+		b.WithExporterLibBpfImage(),
+		b.WithModelServer(),
+	)
+
+	// then the following resources will be created
+	f.AssertResourceExists(testNs, "", &corev1.Namespace{})
+
+	ds := appsv1.DaemonSet{}
+	f.AssertResourceExists(ki.Name, testNs, &ds)
+	containers := ds.Spec.Template.Spec.Containers
+	assert.Equal(t, 1, len(containers))
+	// test expected status
+	f.AssertInternalStatus(ki.Name)
+
+	// confirm model-server deployment ready
+	deploy := appsv1.Deployment{}
+	f.AssertResourceExists(ki.ModelServerDeploymentName(), testNs, &deploy)
+	readyReplicas := deploy.Status.ReadyReplicas
+	
assert.Equal(t, int32(1), readyReplicas)
+}
+
+func TestKeplerInternal_WithEstimatorAndModelServer(t *testing.T) {
+	f := test.NewFramework(t)
+	name := "e2e-kepler-internal-with-estimator-and-modelserver"
+	// Ensure Kepler is not deployed (by any chance)
+	f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait())
+
+	// test namespace must be the deployment namespace for controller
+	// to watch the deployments / daemonsets etc
+	testNs := controllers.KeplerDeploymentNS
+
+	// pre-condition
+	f.AssertNoResourceExists(name, "", &v1alpha1.KeplerInternal{}, test.NoWait())
+	// when
+	b := test.InternalBuilder{}
+	ki := f.CreateInternal(name,
+		b.WithNamespace(testNs),
+		b.WithExporterLibBpfImage(),
+		b.WithEstimator(),
+		b.WithModelServer(),
+	)
+
+	// then the following resources will be created
+	f.AssertResourceExists(testNs, "", &corev1.Namespace{})
+
+	ds := appsv1.DaemonSet{}
+	f.AssertResourceExists(ki.Name, testNs, &ds)
+	containers := ds.Spec.Template.Spec.Containers
+	// daemonset must have a sidecar
+	assert.Equal(t, 2, len(containers))
+	// test expected status
+	f.AssertInternalStatus(ki.Name)

-	// the status will be updated
-	ki = f.WaitUntilInternalCondition(ki.Name, v1alpha1.Reconciled, v1alpha1.ConditionTrue)
-	assert.Equal(t, []corev1.Toleration{{Operator: "Exists"}}, ki.Spec.Exporter.Deployment.Tolerations)
-
-	reconciled, err := k8s.FindCondition(ki.Status.Exporter.Conditions, v1alpha1.Reconciled)
-	assert.NoError(t, err, "unable to get reconciled condition")
-	assert.Equal(t, reconciled.ObservedGeneration, ki.Generation)
-	assert.Equal(t, reconciled.Status, v1alpha1.ConditionTrue)
-	//
-	ki = f.WaitUntilInternalCondition(ki.Name, v1alpha1.Available, v1alpha1.ConditionTrue)
-	available, err := k8s.FindCondition(ki.Status.Exporter.Conditions, v1alpha1.Available)
-	assert.NoError(t, err, "unable to get available condition")
-	assert.Equal(t, available.ObservedGeneration, ki.Generation)
-	assert.Equal(t, available.Status, 
v1alpha1.ConditionTrue) + // confirm model-server deployment ready + deploy := appsv1.Deployment{} + f.AssertResourceExists(ki.ModelServerDeploymentName(), testNs, &deploy) + readyReplicas := deploy.Status.ReadyReplicas + assert.Equal(t, int32(1), readyReplicas) }