From f54d6af3372e47b418ebd6ebc190c3e8bc3de25e Mon Sep 17 00:00:00 2001
From: LindaYu17
Date: Wed, 7 Dec 2022 15:02:35 +0800
Subject: [PATCH] add poolcoordinator controller and webhooks to replace nodelifecycle controller (#1040)

make yurt-controller-manager take care of webhook configurations and certs
add unit tests
---
 charts/openyurt/templates/_helpers.tpl         |   27 +
 .../templates/yurt-controller-manager.yaml     |   93 +-
 charts/openyurt/values.yaml                    |   13 +
 .../app/controllermanager.go                   |    3 +-
 cmd/yurt-controller-manager/app/core.go        |   40 +-
 go.mod                                         |    2 +-
 go.sum                                         |   11 +-
 pkg/controller/nodelifecycle/metrics.go        |   84 -
 .../node_lifecycle_controller.go               | 1567 -------
 .../node_lifecycle_controller_test.go          | 3941 -----------------
 .../scheduler/rate_limited_queue.go            |  308 --
 .../scheduler/rate_limited_queue_test.go       |  334 --
 .../nodelifecycle/scheduler/taint_manager.go   |  540 ---
 .../scheduler/taint_manager_test.go            |  941 ----
 .../nodelifecycle/scheduler/timed_workers.go   |  144 -
 .../scheduler/timed_workers_test.go            |  141 -
 .../poolcoordinator/constant/constant.go       |   31 +
 .../poolcoordinator_controller.go              |  233 +
 .../poolcoordinator_controller_test.go         |   40 +
 pkg/controller/poolcoordinator/utils/file.go   |   56 +
 .../poolcoordinator/utils/file_test.go         |   48 +
 pkg/controller/poolcoordinator/utils/lease.go  |   75 +
 .../poolcoordinator/utils/lease_test.go        |   36 +
 .../poolcoordinator/utils/nodepool.go          |  153 +
 .../poolcoordinator/utils/nodepool_test.go     |  117 +
 .../poolcoordinator/utils/taints.go            |   44 +
 .../poolcoordinator/utils/taints_test.go       |   59 +
 .../poolcoordinator/utils/tolerations.go       |  110 +
 .../poolcoordinator/utils/tolerations_test.go  |   92 +
 pkg/webhook/certs.go                           |  166 +
 pkg/webhook/certs_test.go                      |   29 +
 pkg/webhook/poolcoordinator_webhook.go         |  673 +++
 pkg/webhook/poolcoordinator_webhook_test.go    |  169 +
 pkg/webhook/webhook.go                         |  116 +
 34 files changed, 2392 insertions(+), 8044 deletions(-)
 delete mode 100644 pkg/controller/nodelifecycle/metrics.go
 delete mode 100644 pkg/controller/nodelifecycle/node_lifecycle_controller.go
 delete mode 100644 pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/taint_manager.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/taint_manager_test.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/timed_workers.go
 delete mode 100644 pkg/controller/nodelifecycle/scheduler/timed_workers_test.go
 create mode 100644 pkg/controller/poolcoordinator/constant/constant.go
 create mode 100644 pkg/controller/poolcoordinator/poolcoordinator_controller.go
 create mode 100644 pkg/controller/poolcoordinator/poolcoordinator_controller_test.go
 create mode 100644 pkg/controller/poolcoordinator/utils/file.go
 create mode 100644 pkg/controller/poolcoordinator/utils/file_test.go
 create mode 100644 pkg/controller/poolcoordinator/utils/lease.go
 create mode 100644 pkg/controller/poolcoordinator/utils/lease_test.go
 create mode 100644 pkg/controller/poolcoordinator/utils/nodepool.go
 create mode 100644 pkg/controller/poolcoordinator/utils/nodepool_test.go
 create mode 100644 pkg/controller/poolcoordinator/utils/taints.go
 create mode 100644 pkg/controller/poolcoordinator/utils/taints_test.go
 create mode 100644 pkg/controller/poolcoordinator/utils/tolerations.go
 create mode 100644 pkg/controller/poolcoordinator/utils/tolerations_test.go
 create mode 100644
pkg/webhook/certs.go create mode 100644 pkg/webhook/certs_test.go create mode 100644 pkg/webhook/poolcoordinator_webhook.go create mode 100644 pkg/webhook/poolcoordinator_webhook_test.go create mode 100644 pkg/webhook/webhook.go diff --git a/charts/openyurt/templates/_helpers.tpl b/charts/openyurt/templates/_helpers.tpl index 5b0982c9f5d..b17cd55fdff 100644 --- a/charts/openyurt/templates/_helpers.tpl +++ b/charts/openyurt/templates/_helpers.tpl @@ -1 +1,28 @@ {{/* vim: set filetype=mustache: */}} + +{{- define "yurt-controller-manager.fullname" -}} +yurt-controller-manager +{{- end -}} + +{{- define "yurt-controller-manager.name" -}} +yurt-controller-manager +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "yurt-controller-manager.selectorLabels" -}} +app.kubernetes.io/name: {{ include "yurt-controller-manager.name" . }} +app.kubernetes.io/instance: {{ printf "yurt-controller-manager-%s" .Release.Name }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "yurt-controller-manager.labels" -}} +{{ include "yurt-controller-manager.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} \ No newline at end of file diff --git a/charts/openyurt/templates/yurt-controller-manager.yaml b/charts/openyurt/templates/yurt-controller-manager.yaml index 9a81eb42e08..ae3ef02ca40 100644 --- a/charts/openyurt/templates/yurt-controller-manager.yaml +++ b/charts/openyurt/templates/yurt-controller-manager.yaml @@ -16,19 +16,9 @@ rules: resources: - nodes verbs: - - delete - get - list - - patch - - update - watch - - apiGroups: - - "" - resources: - - nodes/status - verbs: - - patch - - update - apiGroups: - "" resources: @@ -40,8 +30,12 @@ rules: resources: - pods verbs: + - create - delete + - get - list + - patch + - update - watch - apiGroups: - "" @@ -129,6 +123,24 @@ rules: - get - list - watch + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - validatingwebhookconfigurations + - mutatingwebhookconfigurations + verbs: + - create + - delete + - get + - update --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -147,15 +159,18 @@ apiVersion: apps/v1 kind: Deployment metadata: name: yurt-controller-manager + namespace: {{ .Release.Namespace }} + labels: + {{- include "yurt-controller-manager.labels" . | nindent 4 }} spec: replicas: 1 selector: matchLabels: - app: yurt-controller-manager + {{- include "yurt-controller-manager.selectorLabels" . | nindent 6 }} template: metadata: labels: - app: yurt-controller-manager + {{- include "yurt-controller-manager.selectorLabels" . 
| nindent 8 }} spec: serviceAccountName: yurt-controller-manager hostNetwork: true @@ -177,7 +192,53 @@ spec: image: "{{ .Values.yurtControllerManager.image.registry }}/{{ .Values.yurtControllerManager.image.repository }}:{{ .Values.yurtControllerManager.image.tag }}" imagePullPolicy: {{ .Values.yurtControllerManager.image.pullPolicy }} command: - - yurt-controller-manager - {{- if .Values.imagePullSecrets }} - imagePullSecrets: {{ toYaml .Values.imagePullSecrets | nindent 8 }} - {{- end }} \ No newline at end of file + - yurt-controller-manager + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{ toYaml .Values.imagePullSecrets | nindent 10 }} + {{- end }} + ports: + - name: webhook-server + containerPort: {{ .Values.admissionWebhooks.service.port }} + protocol: TCP + - name: health + containerPort: 8000 + protocol: TCP + env: + - name: WEBHOOK_CERT_DIR + value: {{ .Values.admissionWebhooks.certificate.mountPath }} + - name: WEBHOOK_SERVICE_PORT + value: {{ .Values.admissionWebhooks.service.port | quote }} + - name: WEBHOOK_SERVICE_NAME + value: {{ template "yurt-controller-manager.fullname" . }}-webhook + - name: WEBHOOK_NAMESPACE + value: {{ .Release.Namespace }} + - name: WEBHOOK_POD_VALIDATING_CONFIGURATION_NAME + value: {{ template "yurt-controller-manager.fullname" . }} + - name: WEBHOOK_POD_MUTATING_CONFIGURATION_NAME + value: {{ template "yurt-controller-manager.fullname" . }} + - name: WEBHOOK_POD_VALIDATING_NAME + value: {{ .Values.admissionWebhooks.names.validatingWebhookName }} + - name: WEBHOOK_POD_MUTATING_NAME + value: {{ .Values.admissionWebhooks.names.mutatingWebhookName }} + - name: WEBHOOK_POD_VALIDATING_PATH + value: {{ .Values.admissionWebhooks.names.webhookPodValidatingPath }} + - name: WEBHOOK_POD_MUTATING_PATH + value: {{ .Values.admissionWebhooks.names.webhookPodMutatingPath }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "yurt-controller-manager.name" . }}-webhook + namespace: {{ .Release.Namespace }} + labels: + {{- include "yurt-controller-manager.labels" . | nindent 4 }} +spec: + type: {{ .Values.admissionWebhooks.service.type }} + ports: + - port: 443 + targetPort: {{ .Values.admissionWebhooks.service.port }} + protocol: TCP + name: https + selector: + {{ include "yurt-controller-manager.selectorLabels" . | nindent 6 }} diff --git a/charts/openyurt/values.yaml b/charts/openyurt/values.yaml index 0c3ef98420e..b02d32f42f8 100644 --- a/charts/openyurt/values.yaml +++ b/charts/openyurt/values.yaml @@ -13,6 +13,19 @@ yurtControllerManager: type: ClusterIP port: 80 +admissionWebhooks: + service: + type: ClusterIP + port: 9443 + failurePolicy: Fail + certificate: + mountPath: /tmp/k8s-webhook-server/serving-certs + names: + validatingWebhookName: vpoolcoordinator.openyurt.io + mutatingWebhookName: mpoolcoordinator.openyurt.io + webhookPodValidatingPath: /pool-coordinator-webhook-validate + webhookPodMutatingPath: /pool-coordinator-webhook-mutate + yurtTunnelAgent: replicaCount: 1 tolerations: [] diff --git a/cmd/yurt-controller-manager/app/controllermanager.go b/cmd/yurt-controller-manager/app/controllermanager.go index 88c52cada33..8f13a21a1e3 100644 --- a/cmd/yurt-controller-manager/app/controllermanager.go +++ b/cmd/yurt-controller-manager/app/controllermanager.go @@ -309,10 +309,11 @@ var ControllersDisabledByDefault = sets.NewString() // paired to their InitFunc. This allows for structured downstream composition and subdivision. 
func NewControllerInitializers() map[string]InitFunc { controllers := map[string]InitFunc{} - controllers["nodelifecycle"] = startNodeLifecycleController + controllers["poolcoordinator"] = startPoolCoordinatorController controllers["yurtcsrapprover"] = startYurtCSRApproverController controllers["daemonpodupdater"] = startDaemonPodUpdaterController controllers["servicetopologycontroller"] = startServiceTopologyController + controllers["webhookmanager"] = startWebhookManager return controllers } diff --git a/cmd/yurt-controller-manager/app/core.go b/cmd/yurt-controller-manager/app/core.go index 95728245cac..f78a3c2bbbc 100644 --- a/cmd/yurt-controller-manager/app/core.go +++ b/cmd/yurt-controller-manager/app/core.go @@ -23,37 +23,20 @@ package app import ( "net/http" - "time" "github.com/openyurtio/openyurt/pkg/controller/certificates" daemonpodupdater "github.com/openyurtio/openyurt/pkg/controller/daemonpodupdater" - lifecyclecontroller "github.com/openyurtio/openyurt/pkg/controller/nodelifecycle" + poolcoordinator "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator" "github.com/openyurtio/openyurt/pkg/controller/servicetopology" + "github.com/openyurtio/openyurt/pkg/webhook" ) -func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, error) { - lifecycleController, err := lifecyclecontroller.NewNodeLifecycleController( - ctx.InformerFactory.Coordination().V1().Leases(), - ctx.InformerFactory.Core().V1().Pods(), - ctx.InformerFactory.Core().V1().Nodes(), - ctx.InformerFactory.Apps().V1().DaemonSets(), - // node lifecycle controller uses existing cluster role from node-controller - ctx.ClientBuilder.ClientOrDie("node-controller"), - //ctx.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration, - 5*time.Second, - ctx.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration, - ctx.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration, - ctx.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration, - ctx.ComponentConfig.NodeLifecycleController.NodeEvictionRate, - ctx.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate, - ctx.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold, - ctx.ComponentConfig.NodeLifecycleController.UnhealthyZoneThreshold, - *ctx.ComponentConfig.NodeLifecycleController.EnableTaintManager, +func startPoolCoordinatorController(ctx ControllerContext) (http.Handler, bool, error) { + poolcoordinatorController := poolcoordinator.NewController( + ctx.ClientBuilder.ClientOrDie("poolcoordinator-controller"), + ctx.InformerFactory, ) - if err != nil { - return nil, true, err - } - go lifecycleController.Run(ctx.Stop) + go poolcoordinatorController.Run(ctx.Stop) return nil, true, nil } @@ -94,3 +77,12 @@ func startServiceTopologyController(ctx ControllerContext) (http.Handler, bool, go svcTopologyController.Run(ctx.Stop) return nil, true, nil } + +func startWebhookManager(ctx ControllerContext) (http.Handler, bool, error) { + webhookManager := webhook.NewWebhookManager( + ctx.ClientBuilder.ClientOrDie("webhook manager"), + ctx.InformerFactory, + ) + go webhookManager.Run(ctx.Stop) + return nil, true, nil +} diff --git a/go.mod b/go.mod index b59d9a1d095..60fbc2ed791 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.7.0 github.com/vishvananda/netlink v1.1.1-0.20200603190939-5a869a71f0cb + github.com/wI2L/jsondiff v0.3.0 golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a google.golang.org/grpc v1.40.0 
gopkg.in/cheggaaa/pb.v1 v1.0.25 @@ -36,7 +37,6 @@ require ( k8s.io/client-go v0.22.3 k8s.io/cluster-bootstrap v0.22.3 k8s.io/component-base v0.22.3 - k8s.io/component-helpers v0.22.3 k8s.io/controller-manager v0.22.3 k8s.io/klog/v2 v2.9.0 k8s.io/kube-controller-manager v0.22.3 diff --git a/go.sum b/go.sum index d4ec6f07a85..ed692042e88 100644 --- a/go.sum +++ b/go.sum @@ -581,6 +581,13 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= +github.com/tidwall/gjson v1.14.3 h1:9jvXn7olKEHU1S9vwoMGliaT8jq1vJ7IH/n9zD9Dnlw= +github.com/tidwall/gjson v1.14.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 h1:uruHq4dN7GR16kFc5fp3d1RIYzJW5onx8Ybykw2YQFA= @@ -590,6 +597,8 @@ github.com/vishvananda/netlink v1.1.1-0.20200603190939-5a869a71f0cb h1:MY3XXjEi7 github.com/vishvananda/netlink v1.1.1-0.20200603190939-5a869a71f0cb/go.mod h1:FSQhuTO7eHT34mPzX+B04SUAjiqLxtXs1et0S6l9k4k= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +github.com/wI2L/jsondiff v0.3.0 h1:iTzQ9u/d86GE9RsBzVHX88f2EA1vQUboHwLhSQFc1s4= +github.com/wI2L/jsondiff v0.3.0/go.mod h1:y1IMzNNjlSsk3IUoJdRJO7VRBtzMvRgyo4Vu0LdHpTc= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -1146,8 +1155,6 @@ k8s.io/component-base v0.18.8/go.mod h1:00frPRDas29rx58pPCxNkhUfPbwajlyyvu8ruNgS k8s.io/component-base v0.21.1/go.mod h1:NgzFZ2qu4m1juby4TnrmpR8adRk6ka62YdH5DkIIyKA= k8s.io/component-base v0.22.3 h1:/+hryAW03u3FpJQww+GSMsArJNUbGjH66lrgxaRynLU= k8s.io/component-base v0.22.3/go.mod h1:kuybv1miLCMoOk3ebrqF93GbQHQx6W2287FC0YEQY6s= -k8s.io/component-helpers v0.22.3 h1:08tn+T8HnjRTwDP2ErIBhHGvPcYJf5zWaWW83golHWc= -k8s.io/component-helpers v0.22.3/go.mod h1:7OVySVH5elhHKuJKUOxZEfpT1Bm3ChmBQZHmuFfbGHk= k8s.io/controller-manager v0.22.3 h1:nBKG8MsgtUd/oFaZvE5zAYRIr45+Hn8QkHzq5+CtPOE= k8s.io/controller-manager v0.22.3/go.mod h1:4cvQGMvYf6IpTY08/NigEiI5UrN/cbtOe5e5WepYmcQ= k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= diff --git a/pkg/controller/nodelifecycle/metrics.go b/pkg/controller/nodelifecycle/metrics.go deleted file mode 
100644 index ee6cfb0dffc..00000000000 --- a/pkg/controller/nodelifecycle/metrics.go +++ /dev/null @@ -1,84 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package nodelifecycle - -import ( - "sync" - - "k8s.io/component-base/metrics" - "k8s.io/component-base/metrics/legacyregistry" -) - -const ( - nodeControllerSubsystem = "edge_node_collector" - zoneHealthStatisticKey = "zone_health" - zoneSizeKey = "zone_size" - zoneNoUnhealthyNodesKey = "unhealthy_nodes_in_zone" - evictionsNumberKey = "evictions_number" - zone = "zone" -) - -var ( - zoneHealth = metrics.NewGaugeVec( - &metrics.GaugeOpts{ - Subsystem: nodeControllerSubsystem, - Name: zoneHealthStatisticKey, - Help: "Gauge measuring percentage of healthy nodes per zone.", - StabilityLevel: metrics.ALPHA, - }, - []string{zone}, - ) - zoneSize = metrics.NewGaugeVec( - &metrics.GaugeOpts{ - Subsystem: nodeControllerSubsystem, - Name: zoneSizeKey, - Help: "Gauge measuring number of registered Nodes per zones.", - StabilityLevel: metrics.ALPHA, - }, - []string{zone}, - ) - unhealthyNodes = metrics.NewGaugeVec( - &metrics.GaugeOpts{ - Subsystem: nodeControllerSubsystem, - Name: zoneNoUnhealthyNodesKey, - Help: "Gauge measuring number of not Ready Nodes per zones.", - StabilityLevel: metrics.ALPHA, - }, - []string{zone}, - ) - evictionsNumber = metrics.NewCounterVec( - &metrics.CounterOpts{ - Subsystem: nodeControllerSubsystem, - Name: evictionsNumberKey, - Help: "Number of Node evictions that happened since current instance of NodeController started.", - StabilityLevel: metrics.ALPHA, - }, - []string{zone}, - ) -) - -var registerMetrics sync.Once - -// Register the metrics that are to be monitored. -func Register() { - registerMetrics.Do(func() { - legacyregistry.MustRegister(zoneHealth) - legacyregistry.MustRegister(zoneSize) - legacyregistry.MustRegister(unhealthyNodes) - legacyregistry.MustRegister(evictionsNumber) - }) -} diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go deleted file mode 100644 index ce0dd5fc15c..00000000000 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ /dev/null @@ -1,1567 +0,0 @@ -/* -Copyright 2020 The OpenYurt Authors. -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// The Controller sets tainted annotations on nodes. 
-// Tainted nodes should not be used for new work loads and -// some effort should be given to getting existing work -// loads off of tainted nodes. - -package nodelifecycle - -import ( - "context" - "fmt" - "sync" - "time" - - coordv1 "k8s.io/api/coordination/v1" - v1 "k8s.io/api/core/v1" - apiequality "k8s.io/apimachinery/pkg/api/equality" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" - appsv1informers "k8s.io/client-go/informers/apps/v1" - coordinformers "k8s.io/client-go/informers/coordination/v1" - coreinformers "k8s.io/client-go/informers/core/v1" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - v1core "k8s.io/client-go/kubernetes/typed/core/v1" - appsv1listers "k8s.io/client-go/listers/apps/v1" - coordlisters "k8s.io/client-go/listers/coordination/v1" - corelisters "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/flowcontrol" - "k8s.io/client-go/util/workqueue" - "k8s.io/component-base/metrics/prometheus/ratelimiter" - utilnode "k8s.io/component-helpers/node/topology" - "k8s.io/klog/v2" - - "github.com/openyurtio/openyurt/pkg/controller/kubernetes/controller" - taintutils "github.com/openyurtio/openyurt/pkg/controller/kubernetes/util/taints" - "github.com/openyurtio/openyurt/pkg/controller/nodelifecycle/scheduler" - nodeutil "github.com/openyurtio/openyurt/pkg/controller/util/node" -) - -func init() { - // Register prometheus metrics - Register() -} - -const ( - // LabelOS is a label to indicate the operating system of the node. - // The OS labels are promoted to GA in 1.14. kubelet applies GA labels and stop applying the beta OS labels in Kubernetes 1.19. - LabelOS = "beta.kubernetes.io/os" - // LabelArch is a label to indicate the architecture of the node. - // The Arch labels are promoted to GA in 1.14. kubelet applies GA labels and stop applying the beta Arch labels in Kubernetes 1.19. - LabelArch = "beta.kubernetes.io/arch" -) - -var ( - // UnreachableTaintTemplate is the taint for when a node becomes unreachable. 
- UnreachableTaintTemplate = &v1.Taint{ - Key: v1.TaintNodeUnreachable, - Effect: v1.TaintEffectNoExecute, - } - - // NotReadyTaintTemplate is the taint for when a node is not ready for - // executing pods - NotReadyTaintTemplate = &v1.Taint{ - Key: v1.TaintNodeNotReady, - Effect: v1.TaintEffectNoExecute, - } - - // map {NodeConditionType: {ConditionStatus: TaintKey}} - // represents which NodeConditionType under which ConditionStatus should be - // tainted with which TaintKey - // for certain NodeConditionType, there are multiple {ConditionStatus,TaintKey} pairs - nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{ - v1.NodeReady: { - v1.ConditionFalse: v1.TaintNodeNotReady, - v1.ConditionUnknown: v1.TaintNodeUnreachable, - }, - v1.NodeMemoryPressure: { - v1.ConditionTrue: v1.TaintNodeMemoryPressure, - }, - v1.NodeDiskPressure: { - v1.ConditionTrue: v1.TaintNodeDiskPressure, - }, - v1.NodeNetworkUnavailable: { - v1.ConditionTrue: v1.TaintNodeNetworkUnavailable, - }, - v1.NodePIDPressure: { - v1.ConditionTrue: v1.TaintNodePIDPressure, - }, - } - - taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{ - v1.TaintNodeNotReady: v1.NodeReady, - v1.TaintNodeUnreachable: v1.NodeReady, - v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable, - v1.TaintNodeMemoryPressure: v1.NodeMemoryPressure, - v1.TaintNodeDiskPressure: v1.NodeDiskPressure, - v1.TaintNodePIDPressure: v1.NodePIDPressure, - } -) - -// ZoneState is the state of a given zone. -type ZoneState string - -const ( - stateInitial = ZoneState("Initial") - stateNormal = ZoneState("Normal") - stateFullDisruption = ZoneState("FullDisruption") - statePartialDisruption = ZoneState("PartialDisruption") -) - -const ( - // The amount of time the nodecontroller should sleep between retrying node health updates - retrySleepTime = 20 * time.Millisecond - nodeNameKeyIndex = "spec.nodeName" - // podUpdateWorkerSizes assumes that in most cases pod will be handled by monitorNodeHealth pass. - // Pod update workers will only handle lagging cache pods. 4 workers should be enough. - podUpdateWorkerSize = 4 -) - -// labelReconcileInfo lists Node labels to reconcile, and how to reconcile them. -// primaryKey and secondaryKey are keys of labels to reconcile. -// - If both keys exist, but their values don't match. Use the value from the -// primaryKey as the source of truth to reconcile. -// - If ensureSecondaryExists is true, and the secondaryKey does not -// exist, secondaryKey will be added with the value of the primaryKey. -var labelReconcileInfo = []struct { - primaryKey string - secondaryKey string - ensureSecondaryExists bool -}{ - { - // Reconcile the beta and the stable OS label using the beta label as - // the source of truth. - // TODO(#73084): switch to using the stable label as the source of - // truth in v1.18. - primaryKey: LabelOS, - secondaryKey: v1.LabelOSStable, - ensureSecondaryExists: true, - }, - { - // Reconcile the beta and the stable arch label using the beta label as - // the source of truth. - // TODO(#73084): switch to using the stable label as the source of - // truth in v1.18. 
- primaryKey: LabelArch, - secondaryKey: v1.LabelArchStable, - ensureSecondaryExists: true, - }, -} - -type nodeHealthData struct { - probeTimestamp metav1.Time - readyTransitionTimestamp metav1.Time - status *v1.NodeStatus - lease *coordv1.Lease -} - -func (n *nodeHealthData) deepCopy() *nodeHealthData { - if n == nil { - return nil - } - return &nodeHealthData{ - probeTimestamp: n.probeTimestamp, - readyTransitionTimestamp: n.readyTransitionTimestamp, - status: n.status.DeepCopy(), - lease: n.lease.DeepCopy(), - } -} - -type nodeHealthMap struct { - lock sync.RWMutex - nodeHealths map[string]*nodeHealthData -} - -func newNodeHealthMap() *nodeHealthMap { - return &nodeHealthMap{ - nodeHealths: make(map[string]*nodeHealthData), - } -} - -// getDeepCopy - returns copy of node health data. -// It prevents data being changed after retrieving it from the map. -func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData { - n.lock.RLock() - defer n.lock.RUnlock() - return n.nodeHealths[name].deepCopy() -} - -func (n *nodeHealthMap) set(name string, data *nodeHealthData) { - n.lock.Lock() - defer n.lock.Unlock() - n.nodeHealths[name] = data -} - -type podUpdateItem struct { - namespace string - name string -} - -type evictionStatus int - -const ( - unmarked = iota - toBeEvicted - evicted -) - -// nodeEvictionMap stores evictionStatus data for each node. -type nodeEvictionMap struct { - lock sync.Mutex - nodeEvictions map[string]evictionStatus -} - -func newNodeEvictionMap() *nodeEvictionMap { - return &nodeEvictionMap{ - nodeEvictions: make(map[string]evictionStatus), - } -} - -func (n *nodeEvictionMap) registerNode(nodeName string) { - n.lock.Lock() - defer n.lock.Unlock() - n.nodeEvictions[nodeName] = unmarked -} - -func (n *nodeEvictionMap) unregisterNode(nodeName string) { - n.lock.Lock() - defer n.lock.Unlock() - delete(n.nodeEvictions, nodeName) -} - -func (n *nodeEvictionMap) setStatus(nodeName string, status evictionStatus) bool { - n.lock.Lock() - defer n.lock.Unlock() - if _, exists := n.nodeEvictions[nodeName]; !exists { - return false - } - n.nodeEvictions[nodeName] = status - return true -} - -func (n *nodeEvictionMap) getStatus(nodeName string) (evictionStatus, bool) { - n.lock.Lock() - defer n.lock.Unlock() - if _, exists := n.nodeEvictions[nodeName]; !exists { - return unmarked, false - } - return n.nodeEvictions[nodeName], true -} - -// Controller is the controller that manages node's life cycle. -type Controller struct { - taintManager *scheduler.NoExecuteTaintManager - - podLister corelisters.PodLister - podInformerSynced cache.InformerSynced - kubeClient clientset.Interface - - // This timestamp is to be used instead of LastProbeTime stored in Condition. We do this - // to avoid the problem with time skew across the cluster. - now func() metav1.Time - - enterPartialDisruptionFunc func(nodeNum int) float32 - enterFullDisruptionFunc func(nodeNum int) float32 - computeZoneStateFunc func(nodeConditions []*v1.NodeCondition) (int, ZoneState) - - knownNodeSet map[string]*v1.Node - // per Node map storing last observed health together with a local time when it was observed. - nodeHealthMap *nodeHealthMap - - // evictorLock protects zonePodEvictor and zoneNoExecuteTainter. - // TODO(#83954): API calls shouldn't be executed under the lock. - evictorLock sync.Mutex - nodeEvictionMap *nodeEvictionMap - // workers that evicts pods from unresponsive nodes. - zonePodEvictor map[string]*scheduler.RateLimitedTimedQueue - // workers that are responsible for tainting nodes. 
- zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue - - nodesToRetry sync.Map - - zoneStates map[string]ZoneState - - daemonSetStore appsv1listers.DaemonSetLister - daemonSetInformerSynced cache.InformerSynced - - leaseLister coordlisters.LeaseLister - leaseInformerSynced cache.InformerSynced - nodeLister corelisters.NodeLister - nodeInformerSynced cache.InformerSynced - - getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error) - - recorder record.EventRecorder - - // Value controlling Controller monitoring period, i.e. how often does Controller - // check node health signal posted from kubelet. This value should be lower than - // nodeMonitorGracePeriod. - // TODO: Change node health monitor to watch based. - nodeMonitorPeriod time.Duration - - // When node is just created, e.g. cluster bootstrap or node creation, we give - // a longer grace period. - nodeStartupGracePeriod time.Duration - - // Controller will not proactively sync node health, but will monitor node - // health signal updated from kubelet. There are 2 kinds of node healthiness - // signals: NodeStatus and NodeLease. NodeLease signal is generated only when - // NodeLease feature is enabled. If it doesn't receive update for this amount - // of time, it will start posting "NodeReady==ConditionUnknown". The amount of - // time before which Controller start evicting pods is controlled via flag - // 'pod-eviction-timeout'. - // Note: be cautious when changing the constant, it must work with - // nodeStatusUpdateFrequency in kubelet and renewInterval in NodeLease - // controller. The node health signal update frequency is the minimal of the - // two. - // There are several constraints: - // 1. nodeMonitorGracePeriod must be N times more than the node health signal - // update frequency, where N means number of retries allowed for kubelet to - // post node status/lease. It is pointless to make nodeMonitorGracePeriod - // be less than the node health signal update frequency, since there will - // only be fresh values from Kubelet at an interval of node health signal - // update frequency. The constant must be less than podEvictionTimeout. - // 2. nodeMonitorGracePeriod can't be too large for user experience - larger - // value takes longer for user to see up-to-date node health. - nodeMonitorGracePeriod time.Duration - - podEvictionTimeout time.Duration - evictionLimiterQPS float32 - secondaryEvictionLimiterQPS float32 - largeClusterThreshold int32 - unhealthyZoneThreshold float32 - - // if set to true Controller will start TaintManager that will evict Pods from - // tainted nodes, if they're not tolerated. - runTaintManager bool - - nodeUpdateQueue workqueue.Interface - podUpdateQueue workqueue.RateLimitingInterface -} - -// NewNodeLifecycleController returns a new taint controller. 
-func NewNodeLifecycleController( - leaseInformer coordinformers.LeaseInformer, - podInformer coreinformers.PodInformer, - nodeInformer coreinformers.NodeInformer, - daemonSetInformer appsv1informers.DaemonSetInformer, - kubeClient clientset.Interface, - nodeMonitorPeriod time.Duration, - nodeStartupGracePeriod time.Duration, - nodeMonitorGracePeriod time.Duration, - podEvictionTimeout time.Duration, - evictionLimiterQPS float32, - secondaryEvictionLimiterQPS float32, - largeClusterThreshold int32, - unhealthyZoneThreshold float32, - runTaintManager bool, -) (*Controller, error) { - - if kubeClient == nil { - klog.Fatalf("kubeClient is nil when starting Controller") - } - - eventBroadcaster := record.NewBroadcaster() - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"}) - eventBroadcaster.StartLogging(klog.Infof) - - klog.Infof("Sending events to api server.") - eventBroadcaster.StartRecordingToSink( - &v1core.EventSinkImpl{ - Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events(""), - }) - - if kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil { - ratelimiter.RegisterMetricAndTrackRateLimiterUsage("node_lifecycle_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter()) - } - - nc := &Controller{ - kubeClient: kubeClient, - now: metav1.Now, - knownNodeSet: make(map[string]*v1.Node), - nodeHealthMap: newNodeHealthMap(), - nodeEvictionMap: newNodeEvictionMap(), - recorder: recorder, - nodeMonitorPeriod: nodeMonitorPeriod, - nodeStartupGracePeriod: nodeStartupGracePeriod, - nodeMonitorGracePeriod: nodeMonitorGracePeriod, - zonePodEvictor: make(map[string]*scheduler.RateLimitedTimedQueue), - zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), - nodesToRetry: sync.Map{}, - zoneStates: make(map[string]ZoneState), - podEvictionTimeout: podEvictionTimeout, - evictionLimiterQPS: evictionLimiterQPS, - secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, - largeClusterThreshold: largeClusterThreshold, - unhealthyZoneThreshold: unhealthyZoneThreshold, - runTaintManager: runTaintManager, - nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"), - podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"), - } - - nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc - nc.enterFullDisruptionFunc = nc.HealthyQPSFunc - nc.computeZoneStateFunc = nc.ComputeZoneState - - podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - pod := obj.(*v1.Pod) - nc.podUpdated(nil, pod) - if nc.taintManager != nil { - nc.taintManager.PodUpdated(nil, pod) - } - }, - UpdateFunc: func(prev, obj interface{}) { - prevPod := prev.(*v1.Pod) - newPod := obj.(*v1.Pod) - nc.podUpdated(prevPod, newPod) - if nc.taintManager != nil { - nc.taintManager.PodUpdated(prevPod, newPod) - } - }, - DeleteFunc: func(obj interface{}) { - pod, isPod := obj.(*v1.Pod) - // We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly. 
- if !isPod { - deletedState, ok := obj.(cache.DeletedFinalStateUnknown) - if !ok { - klog.Errorf("Received unexpected object: %v", obj) - return - } - pod, ok = deletedState.Obj.(*v1.Pod) - if !ok { - klog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj) - return - } - } - nc.podUpdated(pod, nil) - if nc.taintManager != nil { - nc.taintManager.PodUpdated(pod, nil) - } - }, - }) - nc.podInformerSynced = podInformer.Informer().HasSynced - podInformer.Informer().AddIndexers(cache.Indexers{ - nodeNameKeyIndex: func(obj interface{}) ([]string, error) { - pod, ok := obj.(*v1.Pod) - if !ok { - return []string{}, nil - } - if len(pod.Spec.NodeName) == 0 { - return []string{}, nil - } - return []string{pod.Spec.NodeName}, nil - }, - }) - - podIndexer := podInformer.Informer().GetIndexer() - nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) { - objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName) - if err != nil { - return nil, err - } - pods := make([]*v1.Pod, 0, len(objs)) - for _, obj := range objs { - pod, ok := obj.(*v1.Pod) - if !ok { - continue - } - pods = append(pods, pod) - } - return pods, nil - } - nc.podLister = podInformer.Lister() - - if nc.runTaintManager { - podGetter := func(name, namespace string) (*v1.Pod, error) { return nc.podLister.Pods(namespace).Get(name) } - nodeLister := nodeInformer.Lister() - nodeGetter := func(name string) (*v1.Node, error) { return nodeLister.Get(name) } - nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient, podGetter, nodeGetter, nc.getPodsAssignedToNode) - nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error { - nc.taintManager.NodeUpdated(nil, node) - return nil - }), - UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error { - nc.taintManager.NodeUpdated(oldNode, newNode) - return nil - }), - DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error { - nc.taintManager.NodeUpdated(node, nil) - return nil - }), - }) - } - - klog.Infof("Controller will reconcile labels.") - nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error { - nc.nodeUpdateQueue.Add(node.Name) - nc.nodeEvictionMap.registerNode(node.Name) - return nil - }), - UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { - nc.nodeUpdateQueue.Add(newNode.Name) - return nil - }), - DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error { - nc.nodesToRetry.Delete(node.Name) - nc.nodeEvictionMap.unregisterNode(node.Name) - return nil - }), - }) - - nc.leaseLister = leaseInformer.Lister() - nc.leaseInformerSynced = leaseInformer.Informer().HasSynced - - nc.nodeLister = nodeInformer.Lister() - nc.nodeInformerSynced = nodeInformer.Informer().HasSynced - - nc.daemonSetStore = daemonSetInformer.Lister() - nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced - - return nc, nil -} - -// Run starts an asynchronous loop that monitors the status of cluster nodes. 
-func (nc *Controller) Run(stopCh <-chan struct{}) { - defer utilruntime.HandleCrash() - - klog.Infof("Starting node controller") - defer klog.Infof("Shutting down node controller") - - if !cache.WaitForNamedCacheSync("taint", stopCh, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) { - return - } - - if nc.runTaintManager { - go nc.taintManager.Run(stopCh) - } - - // Close node update queue to cleanup go routine. - defer nc.nodeUpdateQueue.ShutDown() - defer nc.podUpdateQueue.ShutDown() - - // Start workers to reconcile labels and/or update NoSchedule taint for nodes. - for i := 0; i < scheduler.UpdateWorkerSize; i++ { - // Thanks to "workqueue", each worker just need to get item from queue, because - // the item is flagged when got from queue: if new event come, the new item will - // be re-queued until "Done", so no more than one worker handle the same item and - // no event missed. - go wait.Until(nc.doNodeProcessingPassWorker, time.Second, stopCh) - } - - for i := 0; i < podUpdateWorkerSize; i++ { - go wait.Until(nc.doPodProcessingWorker, time.Second, stopCh) - } - - if nc.runTaintManager { - // Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated - // taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints. - go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, stopCh) - } else { - // Managing eviction of nodes: - // When we delete pods off a node, if the node was not empty at the time we then - // queue an eviction watcher. If we hit an error, retry deletion. - go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, stopCh) - } - - // Incorporate the results of node health signal pushed from kubelet to master. - go wait.Until(func() { - if err := nc.monitorNodeHealth(); err != nil { - klog.Errorf("Error monitoring node health: %v", err) - } - }, nc.nodeMonitorPeriod, stopCh) - - <-stopCh -} - -func (nc *Controller) doNodeProcessingPassWorker() { - for { - obj, shutdown := nc.nodeUpdateQueue.Get() - // "nodeUpdateQueue" will be shutdown when "stopCh" closed; - // we do not need to re-check "stopCh" again. - if shutdown { - return - } - nodeName := obj.(string) - if err := nc.doNoScheduleTaintingPass(nodeName); err != nil { - klog.Errorf("Failed to taint NoSchedule on node <%s>, requeue it: %v", nodeName, err) - // TODO(k82cn): Add nodeName back to the queue - } - // TODO: re-evaluate whether there are any labels that need to be - // reconcile in 1.19. Remove this function if it's no longer necessary. - if err := nc.reconcileNodeLabels(nodeName); err != nil { - klog.Errorf("Failed to reconcile labels for node <%s>, requeue it: %v", nodeName, err) - // TODO(yujuhong): Add nodeName back to the queue - } - nc.nodeUpdateQueue.Done(nodeName) - } -} - -func (nc *Controller) doNoScheduleTaintingPass(nodeName string) error { - node, err := nc.nodeLister.Get(nodeName) - if err != nil { - // If node not found, just ignore it. - if apierrors.IsNotFound(err) { - return nil - } - return err - } - - // Map node's condition to Taints. - var taints []v1.Taint - for _, condition := range node.Status.Conditions { - if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found { - if taintKey, found := taintMap[condition.Status]; found { - taints = append(taints, v1.Taint{ - Key: taintKey, - Effect: v1.TaintEffectNoSchedule, - }) - } - } - } - if node.Spec.Unschedulable { - // If unschedulable, append related taint. 
- taints = append(taints, v1.Taint{ - Key: v1.TaintNodeUnschedulable, - Effect: v1.TaintEffectNoSchedule, - }) - } - - // Get exist taints of node. - nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool { - // only NoSchedule taints are candidates to be compared with "taints" later - if t.Effect != v1.TaintEffectNoSchedule { - return false - } - // Find unschedulable taint of node. - if t.Key == v1.TaintNodeUnschedulable { - return true - } - // Find node condition taints of node. - _, found := taintKeyToNodeConditionMap[t.Key] - return found - }) - taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints) - // If nothing to add not delete, return true directly. - if len(taintsToAdd) == 0 && len(taintsToDel) == 0 { - return nil - } - if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) { - return fmt.Errorf("failed to swap taints of node %+v", node) - } - return nil -} - -func (nc *Controller) doNoExecuteTaintingPass() { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - for k := range nc.zoneNoExecuteTainter { - // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). - nc.zoneNoExecuteTainter[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) { - node, err := nc.nodeLister.Get(value.Value) - if apierrors.IsNotFound(err) { - klog.Warningf("Node %v no longer present in nodeLister!", value.Value) - return true, 0 - } else if err != nil { - klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) - // retry in 50 millisecond - return false, 50 * time.Millisecond - } - _, condition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) - // Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive. - taintToAdd := v1.Taint{} - oppositeTaint := v1.Taint{} - switch condition.Status { - case v1.ConditionFalse: - taintToAdd = *NotReadyTaintTemplate - oppositeTaint = *UnreachableTaintTemplate - case v1.ConditionUnknown: - taintToAdd = *UnreachableTaintTemplate - oppositeTaint = *NotReadyTaintTemplate - default: - // It seems that the Node is ready again, so there's no need to taint it. - klog.V(4).Infof("Node %v was in a taint queue, but it's ready now. Ignoring taint request.", value.Value) - return true, 0 - } - - result := nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node) - if result { - //count the evictionsNumber - zone := utilnode.GetZoneKey(node) - evictionsNumber.WithLabelValues(zone).Inc() - } - - return result, 0 - }) - } -} - -func (nc *Controller) doEvictionPass() { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - for k := range nc.zonePodEvictor { - // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). 
- nc.zonePodEvictor[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) { - node, err := nc.nodeLister.Get(value.Value) - if apierrors.IsNotFound(err) { - klog.Warningf("Node %v no longer present in nodeLister!", value.Value) - } else if err != nil { - klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) - } - nodeUID, _ := value.UID.(string) - pods, err := nc.getPodsAssignedToNode(value.Value) - if err != nil { - utilruntime.HandleError(fmt.Errorf("unable to list pods from node %q: %w", value.Value, err)) - return false, 0 - } - remaining, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, value.Value, nodeUID, nc.daemonSetStore) - if err != nil { - // We are not setting eviction status here. - // New pods will be handled by zonePodEvictor retry - // instead of immediate pod eviction. - utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %w", value.Value, err)) - return false, 0 - } - if !nc.nodeEvictionMap.setStatus(value.Value, evicted) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", value.Value) - } - if remaining { - klog.Infof("Pods awaiting deletion due to Controller eviction") - } - - if node != nil { - zone := utilnode.GetZoneKey(node) - evictionsNumber.WithLabelValues(zone).Inc() - } - - return true, 0 - }) - } -} - -// monitorNodeHealth verifies node health are constantly updated by kubelet, and -// if not, post "NodeReady==ConditionUnknown". -// This function will taint nodes who are not ready or not reachable for a long period of time. -func (nc *Controller) monitorNodeHealth() error { - // We are listing nodes from local cache as we can tolerate some small delays - // comparing to state from etcd and there is eventual consistency anyway. - nodes, err := nc.nodeLister.List(labels.Everything()) - if err != nil { - return err - } - added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes) - - for i := range newZoneRepresentatives { - nc.addPodEvictorForNewZone(newZoneRepresentatives[i]) - } - - for i := range added { - klog.V(1).Infof("Controller observed a new Node: %#v", added[i].Name) - nodeutil.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name)) - nc.knownNodeSet[added[i].Name] = added[i] - nc.addPodEvictorForNewZone(added[i]) - if nc.runTaintManager { - nc.markNodeAsReachable(added[i]) - } else { - nc.cancelPodEviction(added[i]) - } - } - - for i := range deleted { - klog.V(1).Infof("Controller observed a Node deletion: %v", deleted[i].Name) - nodeutil.RecordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name)) - delete(nc.knownNodeSet, deleted[i].Name) - } - - zoneToNodeConditions := map[string][]*v1.NodeCondition{} - for i := range nodes { - var gracePeriod time.Duration - var observedReadyCondition v1.NodeCondition - var currentReadyCondition *v1.NodeCondition - node := nodes[i].DeepCopy() - if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) { - gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(node) - if err == nil { - return true, nil - } - name := node.Name - node, err = nc.kubeClient.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{}) - if err != nil { - klog.Errorf("Failed while getting a Node to retry updating node health. 
Probably Node %s was deleted.", name) - return false, err - } - return false, nil - }); err != nil { - klog.Errorf("Update health of Node '%v' from Controller error: %v. "+ - "Skipping - no pods will be evicted.", node.Name, err) - continue - } - - // Some nodes may be excluded from disruption checking - if !isNodeExcludedFromDisruptionChecks(node) { - zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition) - } - - if currentReadyCondition != nil { - pods, err := nc.getPodsAssignedToNode(node.Name) - if err != nil { - utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %w", node.Name, err)) - if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue { - // If error happened during node status transition (Ready -> NotReady) - // we need to mark node for retry to force MarkPodsNotReady execution - // in the next iteration. - nc.nodesToRetry.Store(node.Name, struct{}{}) - } - continue - } - if nc.runTaintManager { - nc.processTaintBaseEviction(node, &observedReadyCondition) - } else { - if err := nc.processNoTaintBaseEviction(node, &observedReadyCondition, gracePeriod, pods); err != nil { - utilruntime.HandleError(fmt.Errorf("unable to evict all pods from node %v: %w; queuing for retry", node.Name, err)) - } - } - - _, needsRetry := nc.nodesToRetry.Load(node.Name) - switch { - case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue: - // Report node event only once when status changed. - nodeutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady") - fallthrough - case needsRetry && observedReadyCondition.Status != v1.ConditionTrue: - if err = nodeutil.MarkPodsNotReady(nc.kubeClient, pods, node.Name, node); err != nil { - utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %w; queuing for retry", node.Name, err)) - nc.nodesToRetry.Store(node.Name, struct{}{}) - continue - } - } - } - nc.nodesToRetry.Delete(node.Name) - } - nc.handleDisruption(zoneToNodeConditions, nodes) - - return nil -} - -func (nc *Controller) processTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition) { - decisionTimestamp := nc.now() - // Check eviction timeout against decisionTimestamp - switch observedReadyCondition.Status { - case v1.ConditionFalse: - // We want to update the taint straight away if Node is already tainted with the UnreachableTaint - if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { - taintToAdd := *NotReadyTaintTemplate - if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) { - klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.") - } - } else if nc.markNodeForTainting(node, v1.ConditionFalse) { - klog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.", - node.Name, - decisionTimestamp, - ) - } - case v1.ConditionUnknown: - // We want to update the taint straight away if Node is already tainted with the UnreachableTaint - if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { - taintToAdd := *UnreachableTaintTemplate - if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) { - klog.Errorf("Failed to instantly swap NotReadyTaint to UnreachableTaint. 
Will try again in the next cycle.") - } - } else if nc.markNodeForTainting(node, v1.ConditionUnknown) { - klog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.", - node.Name, - decisionTimestamp, - ) - } - case v1.ConditionTrue: - removed, err := nc.markNodeAsReachable(node) - if err != nil { - klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name) - } - if removed { - klog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name) - } - } -} - -func (nc *Controller) processNoTaintBaseEviction(node *v1.Node, observedReadyCondition *v1.NodeCondition, gracePeriod time.Duration, pods []*v1.Pod) error { - decisionTimestamp := nc.now() - nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name) - if nodeHealthData == nil { - return fmt.Errorf("health data doesn't exist for node %q", node.Name) - } - // Check eviction timeout against decisionTimestamp - switch observedReadyCondition.Status { - case v1.ConditionFalse: - if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { - enqueued, err := nc.evictPods(node, pods) - if err != nil { - return err - } - if enqueued { - klog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v", - node.Name, - decisionTimestamp, - nodeHealthData.readyTransitionTimestamp, - nc.podEvictionTimeout, - ) - } - } - case v1.ConditionUnknown: - if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) { - enqueued, err := nc.evictPods(node, pods) - if err != nil { - return err - } - if enqueued { - klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v", - node.Name, - decisionTimestamp, - nodeHealthData.readyTransitionTimestamp, - nc.podEvictionTimeout-gracePeriod, - ) - } - } - case v1.ConditionTrue: - if nc.cancelPodEviction(node) { - klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name) - } - } - return nil -} - -// labelNodeDisruptionExclusion is a label on nodes that controls whether they are -// excluded from being considered for disruption checks by the node controller. -const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption" - -func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool { - if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok { - return true - } - return false -} - -// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to -// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred. -func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) { - nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name) - defer func() { - nc.nodeHealthMap.set(node.Name, nodeHealth) - }() - - var gracePeriod time.Duration - var observedReadyCondition v1.NodeCondition - _, currentReadyCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) - if currentReadyCondition == nil { - // If ready condition is nil, then kubelet (or nodecontroller) never posted node status. - // A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime is set - // to node.CreationTimestamp to avoid handle the corner case. 
- observedReadyCondition = v1.NodeCondition{ - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: node.CreationTimestamp, - LastTransitionTime: node.CreationTimestamp, - } - gracePeriod = nc.nodeStartupGracePeriod - if nodeHealth != nil { - nodeHealth.status = &node.Status - } else { - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: node.CreationTimestamp, - readyTransitionTimestamp: node.CreationTimestamp, - } - } - } else { - // If ready condition is not nil, make a copy of it, since we may modify it in place later. - observedReadyCondition = *currentReadyCondition - gracePeriod = nc.nodeMonitorGracePeriod - } - // There are following cases to check: - // - both saved and new status have no Ready Condition set - we leave everything as it is, - // - saved status have no Ready Condition, but current one does - Controller was restarted with Node data already present in etcd, - // - saved status have some Ready Condition, but current one does not - it's an error, but we fill it up because that's probably a good thing to do, - // - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be - // unresponsive, so we leave it as it is, - // - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State - - // everything's in order, no transition occurred, we update only probeTimestamp, - // - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition State - - // Ready Condition changed it state since we last seen it, so we update both probeTimestamp and readyTransitionTimestamp. - // TODO: things to consider: - // - if 'LastProbeTime' have gone back in time its probably an error, currently we ignore it, - // - currently only correct Ready State transition outside of Node Controller is marking it ready by Kubelet, we don't check - // if that's the case, but it does not seem necessary. - var savedCondition *v1.NodeCondition - var savedLease *coordv1.Lease - if nodeHealth != nil { - _, savedCondition = nodeutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) - savedLease = nodeHealth.lease - } - - if nodeHealth == nil { - klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name) - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: nc.now(), - readyTransitionTimestamp: nc.now(), - } - } else if savedCondition == nil && currentReadyCondition != nil { - klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name) - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: nc.now(), - readyTransitionTimestamp: nc.now(), - } - } else if savedCondition != nil && currentReadyCondition == nil { - klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name) - // TODO: figure out what to do in this case. For now we do the same thing as above. - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: nc.now(), - readyTransitionTimestamp: nc.now(), - } - } else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime { - var transitionTime metav1.Time - // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now", - // otherwise we leave it as it is. 
- if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime { - klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition) - transitionTime = nc.now() - } else { - transitionTime = nodeHealth.readyTransitionTimestamp - } - if klog.V(5).Enabled() { - klog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, nodeHealth.status, node.Status) - } else { - klog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name) - } - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: nc.now(), - readyTransitionTimestamp: transitionTime, - } - } - // Always update the probe time if node lease is renewed. - // Note: If kubelet never posted the node status, but continues renewing the - // heartbeat leases, the node controller will assume the node is healthy and - // take no action. - observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name) - if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) { - nodeHealth.lease = observedLease - nodeHealth.probeTimestamp = nc.now() - } - - if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) { - // NodeReady condition or lease was last set longer ago than gracePeriod, so - // update it to Unknown (regardless of its current value) in the master. - - nodeConditionTypes := []v1.NodeConditionType{ - v1.NodeReady, - v1.NodeMemoryPressure, - v1.NodeDiskPressure, - v1.NodePIDPressure, - // We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level. - // v1.NodeNetworkUnavailable, - } - - nowTimestamp := nc.now() - for _, nodeConditionType := range nodeConditionTypes { - _, currentCondition := nodeutil.GetNodeCondition(&node.Status, nodeConditionType) - if currentCondition == nil { - klog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name) - node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ - Type: nodeConditionType, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: node.CreationTimestamp, - LastTransitionTime: nowTimestamp, - }) - } else { - klog.V(2).Infof("node %v hasn't been updated for %+v. Last %v is: %+v", - node.Name, nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition) - if currentCondition.Status != v1.ConditionUnknown { - currentCondition.Status = v1.ConditionUnknown - currentCondition.Reason = "NodeStatusUnknown" - currentCondition.Message = "Kubelet stopped posting node status." - currentCondition.LastTransitionTime = nowTimestamp - } - } - } - // We need to update currentReadyCondition due to its value potentially changed. 
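For reference, the staleness decision driving the block above reduces to a single comparison: the probe timestamp is refreshed by status updates and by heartbeat-lease renewals, and the node's conditions are only forced to Unknown once "now" falls past that timestamp plus the grace period. A minimal sketch, using plain time.Time in place of the metav1/coordv1 types and hypothetical names, not the vendored implementation:

package main

import (
	"fmt"
	"time"
)

// staleAfterGracePeriod reports whether the controller should overwrite a
// node's conditions with ConditionUnknown. probeTimestamp is assumed to be
// refreshed whenever the node posts status or renews its heartbeat lease.
func staleAfterGracePeriod(now, probeTimestamp time.Time, gracePeriod time.Duration) bool {
	return now.After(probeTimestamp.Add(gracePeriod))
}

func main() {
	probe := time.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	now := probe.Add(50 * time.Second)
	fmt.Println(staleAfterGracePeriod(now, probe, 40*time.Second)) // true: conditions go Unknown
	fmt.Println(staleAfterGracePeriod(now, probe, 60*time.Second)) // false: still within grace
}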
- _, currentReadyCondition = nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) - - if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) { - if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(context.TODO(), node, metav1.UpdateOptions{}); err != nil { - klog.Errorf("Error updating node %s: %v", node.Name, err) - return gracePeriod, observedReadyCondition, currentReadyCondition, err - } - nodeHealth = &nodeHealthData{ - status: &node.Status, - probeTimestamp: nodeHealth.probeTimestamp, - readyTransitionTimestamp: nc.now(), - lease: observedLease, - } - return gracePeriod, observedReadyCondition, currentReadyCondition, nil - } - } - - return gracePeriod, observedReadyCondition, currentReadyCondition, nil -} - -func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) { - newZoneStates := map[string]ZoneState{} - allAreFullyDisrupted := true - for k, v := range zoneToNodeConditions { - zoneSize.WithLabelValues(k).Set(float64(len(v))) - unhealthy, newState := nc.computeZoneStateFunc(v) - zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v))) - unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy)) - if newState != stateFullDisruption { - allAreFullyDisrupted = false - } - newZoneStates[k] = newState - if _, had := nc.zoneStates[k]; !had { - klog.Errorf("Setting initial state for unseen zone: %v", k) - nc.zoneStates[k] = stateInitial - } - } - - allWasFullyDisrupted := true - for k, v := range nc.zoneStates { - if _, have := zoneToNodeConditions[k]; !have { - zoneSize.WithLabelValues(k).Set(0) - zoneHealth.WithLabelValues(k).Set(100) - unhealthyNodes.WithLabelValues(k).Set(0) - delete(nc.zoneStates, k) - continue - } - if v != stateFullDisruption { - allWasFullyDisrupted = false - break - } - } - - // At least one node was responding in previous pass or in the current pass. Semantics is as follows: - // - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use, - // - if the new state is "normal" we resume normal operation (go back to default limiter settings), - // - if new state is "fullDisruption" we restore normal eviction rate, - // - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions. - if !allAreFullyDisrupted || !allWasFullyDisrupted { - // We're switching to full disruption mode - if allAreFullyDisrupted { - klog.V(0).Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode.") - for i := range nodes { - if nc.runTaintManager { - _, err := nc.markNodeAsReachable(nodes[i]) - if err != nil { - klog.Errorf("Failed to remove taints from Node %v", nodes[i].Name) - } - } else { - nc.cancelPodEviction(nodes[i]) - } - } - // We stop all evictions. - for k := range nc.zoneStates { - if nc.runTaintManager { - nc.zoneNoExecuteTainter[k].SwapLimiter(0) - } else { - nc.zonePodEvictor[k].SwapLimiter(0) - } - } - for k := range nc.zoneStates { - nc.zoneStates[k] = stateFullDisruption - } - // All rate limiters are updated, so we can return early here. - return - } - // We're exiting full disruption mode - if allWasFullyDisrupted { - klog.V(0).Info("Controller detected that some Nodes are Ready. Exiting master disruption mode.") - // When exiting disruption mode update probe timestamps on all Nodes. 
- now := nc.now() - for i := range nodes { - v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name) - v.probeTimestamp = now - v.readyTransitionTimestamp = now - nc.nodeHealthMap.set(nodes[i].Name, v) - } - // We reset all rate limiters to settings appropriate for the given state. - for k := range nc.zoneStates { - nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k]) - nc.zoneStates[k] = newZoneStates[k] - } - return - } - // We know that there's at least one not-fully disrupted so, - // we can use default behavior for rate limiters - for k, v := range nc.zoneStates { - newState := newZoneStates[k] - if v == newState { - continue - } - klog.V(0).Infof("Controller detected that zone %v is now in state %v.", k, newState) - nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState) - nc.zoneStates[k] = newState - } - } -} - -func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) { - if newPod == nil { - return - } - if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) { - podItem := podUpdateItem{newPod.Namespace, newPod.Name} - nc.podUpdateQueue.Add(podItem) - } -} - -func (nc *Controller) doPodProcessingWorker() { - for { - obj, shutdown := nc.podUpdateQueue.Get() - // "podUpdateQueue" will be shutdown when "stopCh" closed; - // we do not need to re-check "stopCh" again. - if shutdown { - return - } - - podItem := obj.(podUpdateItem) - nc.processPod(podItem) - } -} - -// processPod is processing events of assigning pods to nodes. In particular: -// 1. for NodeReady=true node, taint eviction for this pod will be cancelled -// 2. for NodeReady=false or unknown node, taint eviction of pod will happen and pod will be marked as not ready -// 3. if node doesn't exist in cache, it will be skipped and handled later by doEvictionPass -func (nc *Controller) processPod(podItem podUpdateItem) { - defer nc.podUpdateQueue.Done(podItem) - pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name) - if err != nil { - if apierrors.IsNotFound(err) { - // If the pod was deleted, there is no need to requeue. - return - } - klog.Warningf("Failed to read pod %v/%v: %v.", podItem.namespace, podItem.name, err) - nc.podUpdateQueue.AddRateLimited(podItem) - return - } - - nodeName := pod.Spec.NodeName - - nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName) - if nodeHealth == nil { - // Node data is not gathered yet or node has beed removed in the meantime. - // Pod will be handled by doEvictionPass method. - return - } - - node, err := nc.nodeLister.Get(nodeName) - if err != nil { - klog.Warningf("Failed to read node %v: %v.", nodeName, err) - nc.podUpdateQueue.AddRateLimited(podItem) - return - } - - _, currentReadyCondition := nodeutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) - if currentReadyCondition == nil { - // Lack of NodeReady condition may only happen after node addition (or if it will be maliciously deleted). - // In both cases, the pod will be handled correctly (evicted if needed) during processing - // of the next node update event. - return - } - - pods := []*v1.Pod{pod} - // In taint-based eviction mode, only node updates are processed by NodeLifecycleController. - // Pods are processed by TaintManager. 
- if !nc.runTaintManager { - if err := nc.processNoTaintBaseEviction(node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil { - klog.Warningf("Unable to process pod %+v eviction from node %v: %v.", podItem, nodeName, err) - nc.podUpdateQueue.AddRateLimited(podItem) - return - } - } - - if currentReadyCondition.Status != v1.ConditionTrue { - if err := nodeutil.MarkPodsNotReady(nc.kubeClient, pods, nodeName, node); err != nil { - klog.Warningf("Unable to mark pod %+v NotReady on node %v: %v.", podItem, nodeName, err) - nc.podUpdateQueue.AddRateLimited(podItem) - } - } -} - -func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) { - switch state { - case stateNormal: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS) - } else { - nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS) - } - case statePartialDisruption: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter( - nc.enterPartialDisruptionFunc(zoneSize)) - } else { - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterPartialDisruptionFunc(zoneSize)) - } - case stateFullDisruption: - if nc.runTaintManager { - nc.zoneNoExecuteTainter[zone].SwapLimiter( - nc.enterFullDisruptionFunc(zoneSize)) - } else { - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterFullDisruptionFunc(zoneSize)) - } - } -} - -// classifyNodes classifies the allNodes to three categories: -// 1. added: the nodes that in 'allNodes', but not in 'knownNodeSet' -// 2. deleted: the nodes that in 'knownNodeSet', but not in 'allNodes' -// 3. newZoneRepresentatives: the nodes that in both 'knownNodeSet' and 'allNodes', but no zone states -func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) { - for i := range allNodes { - if _, has := nc.knownNodeSet[allNodes[i].Name]; !has { - added = append(added, allNodes[i]) - } else { - // Currently, we only consider new zone as updated. - zone := utilnode.GetZoneKey(allNodes[i]) - if _, found := nc.zoneStates[zone]; !found { - newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i]) - } - } - } - - // If there's a difference between lengths of known Nodes and observed nodes - // we must have removed some Node. - if len(nc.knownNodeSet)+len(added) != len(allNodes) { - knowSetCopy := map[string]*v1.Node{} - for k, v := range nc.knownNodeSet { - knowSetCopy[k] = v - } - for i := range allNodes { - delete(knowSetCopy, allNodes[i].Name) - } - for i := range knowSetCopy { - deleted = append(deleted, knowSetCopy[i]) - } - } - return -} - -// HealthyQPSFunc returns the default value for cluster eviction rate - we take -// nodeNum for consistency with ReducedQPSFunc. -func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 { - return nc.evictionLimiterQPS -} - -// ReducedQPSFunc returns the QPS for when a the cluster is large make -// evictions slower, if they're small stop evictions altogether. -func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 { - if int32(nodeNum) > nc.largeClusterThreshold { - return nc.secondaryEvictionLimiterQPS - } - return 0 -} - -// addPodEvictorForNewZone checks if new zone appeared, and if so add new evictor. 
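The partial-disruption rate reduction in ReducedQPSFunc above can be read as a tiny pure function: zones larger than the large-cluster threshold fall back to the secondary eviction rate, while smaller zones stop evicting altogether. A minimal sketch with illustrative names and numbers, not the vendored implementation:

package main

import "fmt"

// reducedEvictionQPS mirrors the ReducedQPSFunc behavior sketched above:
// when a zone enters partial disruption, large zones drop to the secondary
// eviction rate and zones at or below the threshold stop evicting.
func reducedEvictionQPS(nodeNum int, secondaryQPS float32, largeClusterThreshold int32) float32 {
	if int32(nodeNum) > largeClusterThreshold {
		return secondaryQPS
	}
	return 0
}

func main() {
	fmt.Println(reducedEvictionQPS(50, 0.01, 20)) // 0.01 evictions/s for a large zone
	fmt.Println(reducedEvictionQPS(10, 0.01, 20)) // 0: small zone, evictions stop
}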
-func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - zone := utilnode.GetZoneKey(node) - if _, found := nc.zoneStates[zone]; !found { - nc.zoneStates[zone] = stateInitial - if !nc.runTaintManager { - nc.zonePodEvictor[zone] = - scheduler.NewRateLimitedTimedQueue( - flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) - } else { - nc.zoneNoExecuteTainter[zone] = - scheduler.NewRateLimitedTimedQueue( - flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) - } - // Init the metric for the new zone. - klog.Infof("Initializing eviction metric for zone: %v", zone) - evictionsNumber.WithLabelValues(zone).Add(0) - } -} - -// cancelPodEviction removes any queued evictions, typically because the node is available again. It -// returns true if an eviction was queued. -func (nc *Controller) cancelPodEviction(node *v1.Node) bool { - zone := utilnode.GetZoneKey(node) - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - if !nc.nodeEvictionMap.setStatus(node.Name, unmarked) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name) - } - wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name) - if wasDeleting { - klog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name) - return true - } - return false -} - -// evictPods: -// - adds node to evictor queue if the node is not marked as evicted. -// Returns false if the node name was already enqueued. -// - deletes pods immediately if node is already marked as evicted. -// Returns false, because the node wasn't added to the queue. -func (nc *Controller) evictPods(node *v1.Node, pods []*v1.Pod) (bool, error) { - // if node is in autonomy status, skip evict pods from the node - if node != nil && node.Annotations != nil && node.Annotations[nodeutil.AnnotationKeyNodeAutonomy] == "true" { - klog.V(2).Infof("node %s is in autonomy status, so skip pods eviction", node.Name) - return false, nil - } - - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - status, ok := nc.nodeEvictionMap.getStatus(node.Name) - if ok && status == evicted { - // Node eviction already happened for this node. - // Handling immediate pod deletion. 
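The autonomy short-circuit at the top of evictPods above is the OpenYurt-specific piece of this controller: a node annotated as autonomous keeps its pods even while NotReady or Unknown. A minimal sketch of that predicate; the annotation key below is a placeholder standing in for nodeutil.AnnotationKeyNodeAutonomy, not the real constant value:

package main

import "fmt"

// annotationKeyNodeAutonomy is a placeholder for nodeutil.AnnotationKeyNodeAutonomy.
const annotationKeyNodeAutonomy = "example.openyurt.io/autonomy"

// isNodeAutonomous reports whether eviction should be skipped for a node,
// following the check in evictPods: pods stay put when the autonomy
// annotation is set to "true".
func isNodeAutonomous(annotations map[string]string) bool {
	return annotations != nil && annotations[annotationKeyNodeAutonomy] == "true"
}

func main() {
	fmt.Println(isNodeAutonomous(map[string]string{annotationKeyNodeAutonomy: "true"})) // true: skip eviction
	fmt.Println(isNodeAutonomous(nil))                                                  // false: evict as usual
}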
- _, err := nodeutil.DeletePods(nc.kubeClient, pods, nc.recorder, node.Name, string(node.UID), nc.daemonSetStore) - if err != nil { - return false, fmt.Errorf("unable to delete pods from node %q: %w", node.Name, err) - } - return false, nil - } - if !nc.nodeEvictionMap.setStatus(node.Name, toBeEvicted) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name) - } - return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID)), nil -} - -func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - if status == v1.ConditionFalse { - if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { - nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name) - } - } - - if status == v1.ConditionUnknown { - if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { - nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name) - } - } - - return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID)) -} - -func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, UnreachableTaintTemplate) - if err != nil { - klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) - return false, err - } - err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, NotReadyTaintTemplate) - if err != nil { - klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) - return false, err - } - return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil -} - -// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone. -// The zone is considered: -// - fullyDisrupted if there're no Ready Nodes, -// - partiallyDisrupted if at least than nc.unhealthyZoneThreshold percent of Nodes are not Ready, -// - normal otherwise -func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) { - readyNodes := 0 - notReadyNodes := 0 - for i := range nodeReadyConditions { - if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue { - readyNodes++ - } else { - notReadyNodes++ - } - } - switch { - case readyNodes == 0 && notReadyNodes > 0: - return notReadyNodes, stateFullDisruption - case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold: - return notReadyNodes, statePartialDisruption - default: - return notReadyNodes, stateNormal - } -} - -// reconcileNodeLabels reconciles node labels. -func (nc *Controller) reconcileNodeLabels(nodeName string) error { - node, err := nc.nodeLister.Get(nodeName) - if err != nil { - // If node not found, just ignore it. - if apierrors.IsNotFound(err) { - return nil - } - return err - } - - if node.Labels == nil { - // Nothing to reconcile. - return nil - } - - labelsToUpdate := map[string]string{} - for _, r := range labelReconcileInfo { - primaryValue, primaryExists := node.Labels[r.primaryKey] - secondaryValue, secondaryExists := node.Labels[r.secondaryKey] - - if !primaryExists { - // The primary label key does not exist. This should not happen - // within our supported version skew range, when no external - // components/factors modifying the node object. Ignore this case. 
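The classification in ComputeZoneState above returns the not-ready count and a zone state (not a slice of conditions, as its doc comment suggests). A minimal sketch of the same classification, with plain booleans standing in for NodeReady conditions and illustrative local names rather than the vendored types:

package main

import "fmt"

type zoneState string

const (
	stateNormal            zoneState = "Normal"
	statePartialDisruption zoneState = "PartialDisruption"
	stateFullDisruption    zoneState = "FullDisruption"
)

// computeZoneState classifies a zone from per-node readiness: full disruption
// when no node is Ready, partial disruption when more than two nodes are
// not Ready and their share reaches the unhealthy-zone threshold, normal otherwise.
func computeZoneState(ready []bool, unhealthyZoneThreshold float32) (int, zoneState) {
	readyNodes, notReadyNodes := 0, 0
	for _, r := range ready {
		if r {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}

func main() {
	fmt.Println(computeZoneState([]bool{false, false, false, true}, 0.55)) // 3 PartialDisruption
	fmt.Println(computeZoneState([]bool{false, false}, 0.55))              // 2 FullDisruption
}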
- continue - } - if secondaryExists && primaryValue != secondaryValue { - // Secondary label exists, but not consistent with the primary - // label. Need to reconcile. - labelsToUpdate[r.secondaryKey] = primaryValue - - } else if !secondaryExists && r.ensureSecondaryExists { - // Apply secondary label based on primary label. - labelsToUpdate[r.secondaryKey] = primaryValue - } - } - - if len(labelsToUpdate) == 0 { - return nil - } - if !nodeutil.AddOrUpdateLabelsOnNode(nc.kubeClient, labelsToUpdate, node) { - return fmt.Errorf("failed update labels for node %+v", node) - } - return nil -} diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go deleted file mode 100644 index 6d854c0dfc0..00000000000 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ /dev/null @@ -1,3941 +0,0 @@ -/* -Copyright 2020 The OpenYurt Authors. -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* -This file was derived from k8s.io/kubernetes/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go -at commit: 27522a29feb. - -CHANGELOG from OpenYurt Authors: -1. Remove Test_isNodeExcludedFromDisruptionChecks. -2. Use LabelOS and LabelArch in node_lifecycle_controller.go instead of kubeletapi. -3. Change cases in TestReconcileNodeLabels, details can be found in comment. -4. Remove master node role case in TestMonitorNodeHealthEvictPodsWithDisruption, details can be found in comment. -5. Add autonomy test case in TestMonitorNodeHealthEvictPods. -6. Add autonomy test case in TestMonitorNodeHealthMarkPodsNotReady. 
-*/ - -package nodelifecycle - -import ( - "context" - "fmt" - "strings" - "testing" - "time" - - apps "k8s.io/api/apps/v1" - coordv1 "k8s.io/api/coordination/v1" - v1 "k8s.io/api/core/v1" - apiequality "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/diff" - "k8s.io/client-go/informers" - appsinformers "k8s.io/client-go/informers/apps/v1" - coordinformers "k8s.io/client-go/informers/coordination/v1" - coreinformers "k8s.io/client-go/informers/core/v1" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" - testcore "k8s.io/client-go/testing" - "k8s.io/utils/pointer" - - "github.com/openyurtio/openyurt/pkg/controller/kubernetes/controller" - "github.com/openyurtio/openyurt/pkg/controller/kubernetes/controller/testutil" - "github.com/openyurtio/openyurt/pkg/controller/kubernetes/util/node" - taintutils "github.com/openyurtio/openyurt/pkg/controller/kubernetes/util/taints" - "github.com/openyurtio/openyurt/pkg/controller/nodelifecycle/scheduler" - nodeutil "github.com/openyurtio/openyurt/pkg/controller/util/node" -) - -const ( - testNodeMonitorGracePeriod = 40 * time.Second - testNodeStartupGracePeriod = 60 * time.Second - testNodeMonitorPeriod = 5 * time.Second - testRateLimiterQPS = float32(10000) - testLargeClusterThreshold = 20 - testUnhealthyThreshold = float32(0.55) -) - -func alwaysReady() bool { return true } - -func fakeGetPodsAssignedToNode(c *fake.Clientset) func(string) ([]*v1.Pod, error) { - return func(nodeName string) ([]*v1.Pod, error) { - selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName}) - pods, err := c.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{ - FieldSelector: selector.String(), - LabelSelector: labels.Everything().String(), - }) - if err != nil { - return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName) - } - rPods := make([]*v1.Pod, len(pods.Items)) - for i := range pods.Items { - rPods[i] = &pods.Items[i] - } - return rPods, nil - } -} - -type nodeLifecycleController struct { - *Controller - leaseInformer coordinformers.LeaseInformer - nodeInformer coreinformers.NodeInformer - daemonSetInformer appsinformers.DaemonSetInformer -} - -// doEviction does the fake eviction and returns the status of eviction operation. 
-func (nc *nodeLifecycleController) doEviction(fakeNodeHandler *testutil.FakeNodeHandler) bool { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - zones := testutil.GetZones(fakeNodeHandler) - for _, zone := range zones { - nc.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { - uid, _ := value.UID.(string) - pods, _ := nc.getPodsAssignedToNode(value.Value) - nodeutil.DeletePods(fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore) - _ = nc.nodeEvictionMap.setStatus(value.Value, evicted) - return true, 0 - }) - } - - for _, action := range fakeNodeHandler.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - return true - } - } - return false -} - -func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease { - return &coordv1.Lease{ - ObjectMeta: metav1.ObjectMeta{ - Name: nodeName, - Namespace: v1.NamespaceNodeLease, - }, - Spec: coordv1.LeaseSpec{ - HolderIdentity: pointer.StringPtr(nodeName), - RenewTime: &renewTime, - }, - } -} - -func (nc *nodeLifecycleController) syncLeaseStore(lease *coordv1.Lease) error { - if lease == nil { - return nil - } - newElems := make([]interface{}, 0, 1) - newElems = append(newElems, lease) - return nc.leaseInformer.Informer().GetStore().Replace(newElems, "newRV") -} - -func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeNodeHandler) error { - nodes, err := fakeNodeHandler.List(context.TODO(), metav1.ListOptions{}) - if err != nil { - return err - } - newElems := make([]interface{}, 0, len(nodes.Items)) - for i := range nodes.Items { - newElems = append(newElems, &nodes.Items[i]) - } - return nc.nodeInformer.Informer().GetStore().Replace(newElems, "newRV") -} - -func newNodeLifecycleControllerFromClient( - kubeClient clientset.Interface, - podEvictionTimeout time.Duration, - evictionLimiterQPS float32, - secondaryEvictionLimiterQPS float32, - largeClusterThreshold int32, - unhealthyZoneThreshold float32, - nodeMonitorGracePeriod time.Duration, - nodeStartupGracePeriod time.Duration, - nodeMonitorPeriod time.Duration, - useTaints bool, -) (*nodeLifecycleController, error) { - - factory := informers.NewSharedInformerFactory(kubeClient, controller.NoResyncPeriodFunc()) - - leaseInformer := factory.Coordination().V1().Leases() - nodeInformer := factory.Core().V1().Nodes() - daemonSetInformer := factory.Apps().V1().DaemonSets() - - nc, err := NewNodeLifecycleController( - leaseInformer, - factory.Core().V1().Pods(), - nodeInformer, - daemonSetInformer, - kubeClient, - nodeMonitorPeriod, - nodeStartupGracePeriod, - nodeMonitorGracePeriod, - podEvictionTimeout, - evictionLimiterQPS, - secondaryEvictionLimiterQPS, - largeClusterThreshold, - unhealthyZoneThreshold, - useTaints, - ) - if err != nil { - return nil, err - } - - nc.leaseInformerSynced = alwaysReady - nc.podInformerSynced = alwaysReady - nc.nodeInformerSynced = alwaysReady - nc.daemonSetInformerSynced = alwaysReady - - return &nodeLifecycleController{nc, leaseInformer, nodeInformer, daemonSetInformer}, nil -} - -func TestMonitorNodeHealthEvictPods(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - labels := map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - } - - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need 
second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status has just been updated, and is NotReady for 10min. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - - table := []struct { - fakeNodeHandler *testutil.FakeNodeHandler - daemonSets []apps.DaemonSet - timeToPass time.Duration - newNodeStatus v1.NodeStatus - secondNodeNewStatus v1.NodeStatus - expectedEvictPods bool - description string - }{ - // Node created recently, with no status (happens only at cluster startup). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: 0, - newNodeStatus: v1.NodeStatus{}, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Node created recently, with no status.", - }, - // Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: 0, - newNodeStatus: v1.NodeStatus{}, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Node created recently without FailureDomain labels which is added back later, with no status (happens only at cluster startup).", - }, - // Node created long time ago, and kubelet posted NotReady for a short period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: evictionTimeout, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - // Node status has just been updated, and is NotReady for 10min. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Node created long time ago, and kubelet posted NotReady for a short period of time.", - }, - // Pod is ds-managed, and kubelet posted NotReady for a long period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset( - &v1.PodList{ - Items: []v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "pod0", - Namespace: "default", - Labels: map[string]string{"daemon": "yes"}, - }, - Spec: v1.PodSpec{ - NodeName: "node0", - }, - }, - }, - }, - ), - }, - daemonSets: []apps.DaemonSet{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "ds0", - Namespace: "default", - }, - Spec: apps.DaemonSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{"daemon": "yes"}, - }, - }, - }, - }, - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - // Node status has just been updated, and is NotReady for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Pod is ds-managed, and kubelet posted NotReady for a long period of time.", - }, - // Node created long time ago, and kubelet posted NotReady for a long period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - // Node status has just been updated, and is NotReady for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 59, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: true, - description: "Node created long time ago, and kubelet posted NotReady for a long period of time.", - }, - // Node created long time ago, node controller posted Unknown for a short period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: evictionTimeout - testNodeMonitorGracePeriod, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 10min ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Node created long time ago, node controller posted Unknown for a short period of time.", - }, - // Node created long time ago, node controller posted Unknown for a long period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: 60 * time.Minute, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 1hr ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: true, - description: "Node created long time ago, node controller posted Unknown for a long period of time.", - }, - // From OpenYurt Authors: - // When node runs in autonomy mode, do not evict pods on it even its ready status is Unknown. - // Node created long time ago but run in autonomy mod, node controller posted Unknown for a short period of time. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Annotations: map[string]string{ - nodeutil.AnnotationKeyNodeAutonomy: "true", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - daemonSets: nil, - timeToPass: 60 * time.Minute, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedEvictPods: false, - description: "Node created long time ago and is running in autonomy mode, node controller posted Unknown for a long period of time", - }, - } - - for _, item := range table { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - for _, ds := range item.daemonSets { - nodeController.daemonSetInformer.Informer().GetStore().Add(&ds) - } - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - if item.timeToPass > 0 { - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } - item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus - item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus - } - if len(item.fakeNodeHandler.Existing[0].Labels) == 0 && len(item.fakeNodeHandler.Existing[1].Labels) == 0 { - item.fakeNodeHandler.Existing[0].Labels = labels - item.fakeNodeHandler.Existing[1].Labels = labels - } - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - zones := testutil.GetZones(item.fakeNodeHandler) - for _, zone := range zones { - if _, ok := nodeController.zonePodEvictor[zone]; ok { - nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { - nodeUID, _ := value.UID.(string) - pods, err := nodeController.getPodsAssignedToNode(value.Value) - if err != 
nil { - t.Errorf("unexpected error: %v", err) - } - t.Logf("listed pods %d for node %v", len(pods), value.Value) - nodeutil.DeletePods(item.fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetInformer.Lister()) - return true, 0 - }) - } else { - t.Fatalf("Zone %v was uninitialized!", zone) - } - } - - podEvicted := false - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podEvicted = true - } - } - - if item.expectedEvictPods != podEvicted { - t.Errorf("expected pod eviction: %+v, got %+v for %+v", item.expectedEvictPods, - podEvicted, item.description) - } - } -} - -func TestPodStatusChange(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status has just been updated, and is NotReady for 10min. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - - // Node created long time ago, node controller posted Unknown for a long period of time. - table := []struct { - fakeNodeHandler *testutil.FakeNodeHandler - timeToPass time.Duration - newNodeStatus v1.NodeStatus - secondNodeNewStatus v1.NodeStatus - expectedPodUpdate bool - expectedReason string - description string - }{ - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - timeToPass: 60 * time.Minute, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 1hr ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - secondNodeNewStatus: healthyNodeNewStatus, - expectedPodUpdate: true, - expectedReason: node.NodeUnreachablePodReason, - description: "Node 
created long time ago, node controller posted Unknown for a " + - "long period of time, the pod status must include reason for termination.", - }, - } - - for _, item := range table { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - if item.timeToPass > 0 { - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } - item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus - item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus - } - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - zones := testutil.GetZones(item.fakeNodeHandler) - for _, zone := range zones { - nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { - nodeUID, _ := value.UID.(string) - pods, err := nodeController.getPodsAssignedToNode(value.Value) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeutil.DeletePods(item.fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetStore) - return true, 0 - }) - } - - podReasonUpdate := false - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" { - updateReason := action.(testcore.UpdateActionImpl).GetObject().(*v1.Pod).Status.Reason - podReasonUpdate = true - if updateReason != item.expectedReason { - t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description) - } - } - } - - if podReasonUpdate != item.expectedPodUpdate { - t.Errorf("expected pod update: %+v, got %+v for %+v", item.expectedPodUpdate, podReasonUpdate, item.description) - } - } -} - -func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - timeToPass := 60 * time.Minute - - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. 
- healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - unhealthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - // Node status was updated by nodecontroller 1hr ago - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - - table := []struct { - nodeList []*v1.Node - podList []v1.Pod - updatedNodeStatuses []v1.NodeStatus - expectedInitialStates map[string]ZoneState - expectedFollowingStates map[string]ZoneState - expectedEvictPods bool - description string - }{ - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. - // Only zone is down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption}, - expectedEvictPods: false, - description: "Network Disruption: Only zone is down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // Both zones down - eviction shouldn't take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region2", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region2", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, - }, - expectedEvictPods: false, - description: "Network Disruption: Both zones down - eviction shouldn't take place.", - }, - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes. 
- // One zone is down - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone2", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone2", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, - testutil.CreateZoneID("region1", "zone2"): stateNormal, - }, - expectedEvictPods: true, - description: "Network Disruption: One zone is down - eviction should take place.", - }, - // OpenYurt Authors: - // We don't use master node role to determine whether exclude this node from disruption check in - // func isNodeExcludedFromDisruptionChecks in node_lifecycle_controller.go. So, do not run this case. - // NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period - // of on first Node, eviction should stop even though -master Node is healthy. 
-		// {
-		// 	nodeList: []*v1.Node{
-		// 		{
-		// 			ObjectMeta: metav1.ObjectMeta{
-		// 				Name:              "node0",
-		// 				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
-		// 				Labels: map[string]string{
-		// 					v1.LabelTopologyRegion:          "region1",
-		// 					v1.LabelTopologyZone:            "zone1",
-		// 					v1.LabelFailureDomainBetaRegion: "region1",
-		// 					v1.LabelFailureDomainBetaZone:   "zone1",
-		// 				},
-		// 			},
-		// 			Status: v1.NodeStatus{
-		// 				Conditions: []v1.NodeCondition{
-		// 					{
-		// 						Type:               v1.NodeReady,
-		// 						Status:             v1.ConditionUnknown,
-		// 						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-		// 						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-		// 					},
-		// 				},
-		// 			},
-		// 		},
-		// 		{
-		// 			ObjectMeta: metav1.ObjectMeta{
-		// 				Name:              "node-master",
-		// 				CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
-		// 				Labels: map[string]string{
-		// 					v1.LabelTopologyRegion:          "region1",
-		// 					v1.LabelTopologyZone:            "zone1",
-		// 					v1.LabelFailureDomainBetaRegion: "region1",
-		// 					v1.LabelFailureDomainBetaZone:   "zone1",
-		// 				},
-		// 			},
-		// 			Status: v1.NodeStatus{
-		// 				Conditions: []v1.NodeCondition{
-		// 					{
-		// 						Type:               v1.NodeReady,
-		// 						Status:             v1.ConditionTrue,
-		// 						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-		// 						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-		// 					},
-		// 				},
-		// 			},
-		// 		},
-		// 	},
-		// 	podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
-		// 	updatedNodeStatuses: []v1.NodeStatus{
-		// 		unhealthyNodeNewStatus,
-		// 		healthyNodeNewStatus,
-		// 	},
-		// 	expectedInitialStates: map[string]ZoneState{
-		// 		testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
-		// 	},
-		// 	expectedFollowingStates: map[string]ZoneState{
-		// 		testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
-		// 	},
-		// 	expectedEvictPods: false,
-		// 	description:       "NetworkDisruption: eviction should stop, only -master Node is healthy",
-		// },
-		// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
-		// Initially both zones down, one comes back - eviction should take place
-		{
-			nodeList: []*v1.Node{
-				{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:              "node0",
-						CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
-						Labels: map[string]string{
-							v1.LabelTopologyRegion:          "region1",
-							v1.LabelTopologyZone:            "zone1",
-							v1.LabelFailureDomainBetaRegion: "region1",
-							v1.LabelFailureDomainBetaZone:   "zone1",
-						},
-					},
-					Status: v1.NodeStatus{
-						Conditions: []v1.NodeCondition{
-							{
-								Type:               v1.NodeReady,
-								Status:             v1.ConditionUnknown,
-								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-							},
-						},
-					},
-				},
-				{
-					ObjectMeta: metav1.ObjectMeta{
-						Name:              "node1",
-						CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
-						Labels: map[string]string{
-							v1.LabelTopologyRegion:          "region1",
-							v1.LabelTopologyZone:            "zone2",
-							v1.LabelFailureDomainBetaRegion: "region1",
-							v1.LabelFailureDomainBetaZone:   "zone2",
-						},
-					},
-					Status: v1.NodeStatus{
-						Conditions: []v1.NodeCondition{
-							{
-								Type:               v1.NodeReady,
-								Status:             v1.ConditionUnknown,
-								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
-							},
-						},
-					},
-				},
-			},
-
-			podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
-			updatedNodeStatuses: []v1.NodeStatus{
-				unhealthyNodeNewStatus,
-				healthyNodeNewStatus,
-			},
-			expectedInitialStates: map[string]ZoneState{
-				testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
-				testutil.CreateZoneID("region1", "zone2"): stateFullDisruption,
-			},
-			expectedFollowingStates: map[string]ZoneState{
-				testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
-				testutil.CreateZoneID("region1", "zone2"): stateNormal,
-			},
-			expectedEvictPods: true,
-			description:       "Initially both zones down, one comes back - eviction should take place",
-		},
-		// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
- // Zone is partially disrupted - eviction should take place - { - nodeList: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node4", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - - podList: []v1.Pod{*testutil.NewPod("pod0", "node0")}, - updatedNodeStatuses: []v1.NodeStatus{ - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - unhealthyNodeNewStatus, - healthyNodeNewStatus, - healthyNodeNewStatus, - }, - expectedInitialStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedFollowingStates: map[string]ZoneState{ - testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, - }, - expectedEvictPods: 
true, - description: "Zone is partially disrupted - eviction should take place.", - }, - } - - for _, item := range table { - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: item.nodeList, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}), - } - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 { - return testRateLimiterQPS - } - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - - for zone, state := range item.expectedInitialStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } - for i := range item.updatedNodeStatuses { - fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i] - } - - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("%v: unexpected error: %v", item.description, err) - } - for zone, state := range item.expectedFollowingStates { - if state != nodeController.zoneStates[zone] { - t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state) - } - } - var podEvicted bool - start := time.Now() - // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function. - // this breaks when we have the status that we need for test case or when we don't see the - // intended result after 1 minute. - for { - podEvicted = nodeController.doEviction(fakeNodeHandler) - if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute { - break - } - } - if item.expectedEvictPods != podEvicted { - t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted) - } - } -} - -func TestMonitorNodeHealthUpdateStatus(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - table := []struct { - fakeNodeHandler *testutil.FakeNodeHandler - timeToPass time.Duration - newNodeStatus v1.NodeStatus - expectedRequestCount int - expectedNodes []*v1.Node - expectedPodStatusUpdate bool - }{ - // Node created long time ago, without status: - // Expect Unknown status posted from node controller. 
- { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 2, // List+Update - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeMemoryPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodePIDPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - LastTransitionTime: fakeNow, - }, - }, - }, - }, - }, - expectedPodStatusUpdate: false, // Pod was never scheduled - }, - // Node created recently, without status. - // Expect no action from node controller (within startup grace period). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 1, // List - expectedNodes: nil, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. - // Expect Unknown status posted from node controller. - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 3, // (List+)List+Update - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. 
- LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusUnknown", - Message: "Kubelet stopped posting node status.", - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, - }, - { - Type: v1.NodeMemoryPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, - }, - { - Type: v1.NodePIDPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - expectedPodStatusUpdate: true, - }, - // Node created long time ago, with status updated recently. - // Expect no action from node controller (within monitor grace period). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status has just been updated. 
- LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 1, // List - expectedNodes: nil, - expectedPodStatusUpdate: false, - }, - } - for i, item := range table { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - 5*time.Minute, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - if item.timeToPass > 0 { - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } - item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - } - if item.expectedRequestCount != item.fakeNodeHandler.RequestCount { - t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount) - } - if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) { - t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0])) - } - if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) { - t.Errorf("Case[%d] unexpected nodes: %s", i, diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0])) - } - - podStatusUpdated := false - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { - podStatusUpdated = true - } - } - if podStatusUpdated != item.expectedPodStatusUpdate { - t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated) - } - } -} - -func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { - nodeCreationTime := metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC) - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - testcases := []struct { - description string - fakeNodeHandler *testutil.FakeNodeHandler - lease *coordv1.Lease - timeToPass time.Duration - newNodeStatus v1.NodeStatus - newLease *coordv1.Lease - expectedRequestCount int - expectedNodes []*v1.Node - expectedPodStatusUpdate bool - }{ - // Node created recently, without status. Node lease is missing. - // Expect no action from node controller (within startup grace period). - { - description: "Node created recently, without status. 
Node lease is missing.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 1, // List - expectedNodes: nil, - expectedPodStatusUpdate: false, - }, - // Node created recently, without status. Node lease is renewed recently. - // Expect no action from node controller (within startup grace period). - { - description: "Node created recently, without status. Node lease is renewed recently.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - expectedRequestCount: 1, // List - expectedNodes: nil, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, without status. Node lease is missing. - // Expect Unknown status posted from node controller. - { - description: "Node created long time ago, without status. Node lease is missing.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedRequestCount: 2, // List+Update - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeMemoryPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodePIDPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: fakeNow, - }, - }, - }, - }, - }, - expectedPodStatusUpdate: false, // Pod was never scheduled because the node was never ready. - }, - // Node created long time ago, without status. Node lease is renewed recently. - // Expect no action from node controller (within monitor grace period). - { - description: "Node created long time ago, without status. 
Node lease is renewed recently.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - timeToPass: time.Hour, - newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. - expectedRequestCount: 2, // List+List - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - }, - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, without status. Node lease is expired. - // Expect Unknown status posted from node controller. - { - description: "Node created long time ago, without status. Node lease is expired.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - timeToPass: time.Hour, - newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. - expectedRequestCount: 3, // List+List+Update - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodeMemoryPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodePIDPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - }, - }, - }, - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed. - // Expect no action from node controller (within monitor grace period). - { - description: "Node created long time ago, with status updated by kubelet exceeds grace period. 
Node lease is renewed.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - expectedRequestCount: 2, // List+List - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - // Node status hasn't been updated for 1 hour. - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated by kubelet recently. Node lease is expired. - // Expect no action from node controller (within monitor grace period). - { - description: "Node created long time ago, with status updated by kubelet recently. Node lease is expired.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - expectedRequestCount: 2, // List+List - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - // Node status is updated after 1 hour. 
- Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. - expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - LastTransitionTime: fakeNow, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired. - // Expect Unknown status posted from node controller. - { - description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), - expectedRequestCount: 3, // List+List+Update - timeToPass: time.Hour, - newNodeStatus: v1.NodeStatus{ - // Node status hasn't been updated for 1 hour. - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
- expectedNodes: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: nodeCreationTime, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusUnknown", - Message: "Kubelet stopped posting node status.", - LastHeartbeatTime: fakeNow, - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodeMemoryPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodeDiskPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - { - Type: v1.NodePIDPressure, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: "Kubelet never posted node status.", - LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated - LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - expectedPodStatusUpdate: true, - }, - } - - for _, item := range testcases { - t.Run(item.description, func(t *testing.T) { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - 5*time.Minute, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if err := nodeController.syncLeaseStore(item.lease); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if item.timeToPass > 0 { - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } - item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if err := nodeController.syncLeaseStore(item.newLease); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Fatalf("unexpected error: %v", err) - } - } - if item.expectedRequestCount != item.fakeNodeHandler.RequestCount { - t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount) - } - if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) { - t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0])) - } - if 
len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) { - t.Errorf("unexpected nodes: %s", diff.ObjectDiff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0])) - } - - podStatusUpdated := false - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { - podStatusUpdated = true - } - } - if podStatusUpdated != item.expectedPodStatusUpdate { - t.Errorf("expect pod status updated to be %v, but got %v", item.expectedPodStatusUpdate, podStatusUpdated) - } - }) - } -} - -func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) { - fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - table := []struct { - fakeNodeHandler *testutil.FakeNodeHandler - timeToPass time.Duration - newNodeStatus v1.NodeStatus - expectedPodStatusUpdate bool - }{ - // Node created recently, without status. - // Expect no action from node controller (within startup grace period). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated recently. - // Expect no action from node controller (within monitor grace period). - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status has just been updated. - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - expectedPodStatusUpdate: false, - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. - // Expect pods status updated and Unknown node status posted from node controller - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - timeToPass: 1 * time.Minute, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. 
- LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - }, - expectedPodStatusUpdate: true, - }, - // From OpenYurt Authors: - // Node created long time ago and run in autonomy mode, with status updated by kubelet exceeds grace period. - // Expect pods status is not updated and Unknown node status posted from node controller. - { - fakeNodeHandler: &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Annotations: map[string]string{ - nodeutil.AnnotationKeyNodeAutonomy: "true", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - timeToPass: 1 * time.Minute, - newNodeStatus: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - // Node status hasn't been updated for 1hr. - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - expectedPodStatusUpdate: false, - }, - } - - for i, item := range table { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - 5*time.Minute, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("Case[%d] unexpected error: %v", i, err) - } - if item.timeToPass > 0 { - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } - item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("Case[%d] unexpected error: %v", i, err) - } - } - - podStatusUpdated := false - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { - podStatusUpdated = true - } - } - if podStatusUpdated != item.expectedPodStatusUpdate { - t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated) - } - } -} - -func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) { - type nodeIteration struct { - timeToPass time.Duration - newNodes []*v1.Node - } - timeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) - 
timePlusTwoMinutes := metav1.Date(2015, 1, 1, 12, 0, 2, 0, time.UTC) - makeNodes := func(status v1.ConditionStatus, lastHeartbeatTime, lastTransitionTime metav1.Time) []*v1.Node { - return []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: timeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: status, - LastHeartbeatTime: lastHeartbeatTime, - LastTransitionTime: lastTransitionTime, - }, - }, - }, - }, - } - } - table := []struct { - desc string - fakeNodeHandler *testutil.FakeNodeHandler - updateReactor func(action testcore.Action) (bool, runtime.Object, error) - fakeGetPodsAssignedToNode func(c *fake.Clientset) func(string) ([]*v1.Pod, error) - nodeIterations []nodeIteration - expectedPodStatusUpdates int - }{ - // Node created long time ago, with status updated by kubelet exceeds grace period. - // First monitorNodeHealth check will update pod status to NotReady. - // Second monitorNodeHealth check will do no updates (no retry). - { - desc: "successful pod status update, no retry required", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode, - nodeIterations: []nodeIteration{ - { - timeToPass: 0, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), - }, - }, - expectedPodStatusUpdates: 1, - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. - // First monitorNodeHealth check will fail to update pod status to NotReady. - // Second monitorNodeHealth check will update pod status to NotReady (retry). - { - desc: "unsuccessful pod status update, retry required", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - updateReactor: func() func(action testcore.Action) (bool, runtime.Object, error) { - i := 0 - return func(action testcore.Action) (bool, runtime.Object, error) { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { - i++ - switch i { - case 1: - return true, nil, fmt.Errorf("fake error") - default: - return true, testutil.NewPod("pod0", "node0"), nil - } - } - - return true, nil, fmt.Errorf("unsupported action") - } - }(), - fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode, - nodeIterations: []nodeIteration{ - { - timeToPass: 0, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), - }, - }, - expectedPodStatusUpdates: 2, // One failed and one retry. - }, - // Node created long time ago, with status updated by kubelet exceeds grace period. - // First monitorNodeHealth check will fail to list pods. - // Second monitorNodeHealth check will update pod status to NotReady (retry). 
- { - desc: "unsuccessful pod list, retry required", - fakeNodeHandler: &testutil.FakeNodeHandler{ - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - }, - fakeGetPodsAssignedToNode: func(c *fake.Clientset) func(string) ([]*v1.Pod, error) { - i := 0 - f := fakeGetPodsAssignedToNode(c) - return func(nodeName string) ([]*v1.Pod, error) { - i++ - if i == 1 { - return nil, fmt.Errorf("fake error") - } - return f(nodeName) - } - }, - nodeIterations: []nodeIteration{ - { - timeToPass: 0, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), - }, - { - timeToPass: 1 * time.Minute, - newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), - }, - }, - expectedPodStatusUpdates: 1, - }, - } - - for _, item := range table { - t.Run(item.desc, func(t *testing.T) { - nodeController, _ := newNodeLifecycleControllerFromClient( - item.fakeNodeHandler, - 5*time.Minute, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - if item.updateReactor != nil { - item.fakeNodeHandler.Clientset.PrependReactor("update", "pods", item.updateReactor) - } - nodeController.now = func() metav1.Time { return timeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = item.fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) - for _, itertion := range item.nodeIterations { - nodeController.now = func() metav1.Time { return metav1.Time{Time: timeNow.Add(itertion.timeToPass)} } - item.fakeNodeHandler.Existing = itertion.newNodes - if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - } - - podStatusUpdates := 0 - for _, action := range item.fakeNodeHandler.Actions() { - if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { - podStatusUpdates++ - } - } - if podStatusUpdates != item.expectedPodStatusUpdates { - t.Errorf("expect pod status updated to happen %d times, but got %d", item.expectedPodStatusUpdates, podStatusUpdates) - } - }) - } -} - -// TestApplyNoExecuteTaints, ensures we just have a NoExecute taint applied to node. -// NodeController is just responsible for enqueuing the node to tainting queue from which taint manager picks up -// and evicts the pods on the node. -func TestApplyNoExecuteTaints(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - // Unreachable Taint with effect 'NoExecute' should be applied to this node. 
- { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // NotReady Taint with NoExecute effect should be applied to this node. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - originalTaint := UnreachableTaintTemplate - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeController.doNoExecuteTaintingPass() - node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node0...") - return - } - if !taintutils.TaintExists(node0.Spec.Taints, UnreachableTaintTemplate) { - t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) - } - node2, err := fakeNodeHandler.Get(context.TODO(), 
"node2", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node2...") - return - } - if !taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) { - t.Errorf("Can't find taint %v in %v", NotReadyTaintTemplate, node2.Spec.Taints) - } - - // Make node3 healthy again. - node2.Status = healthyNodeNewStatus - _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node2, metav1.UpdateOptions{}) - if err != nil { - t.Errorf(err.Error()) - return - } - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeController.doNoExecuteTaintingPass() - - node2, err = fakeNodeHandler.Get(context.TODO(), "node2", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node2...") - return - } - // We should not see any taint on the node(especially the Not-Ready taint with NoExecute effect). - if taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) || len(node2.Spec.Taints) > 0 { - t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node2.Spec.Taints) - } -} - -// TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice -func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - // Unreachable Taint with effect 'NoExecute' should be applied to this node. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // NotReady Taint with NoExecute effect should be applied to this node. 
- { - ObjectMeta: metav1.ObjectMeta{ - Name: "node2", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - // 1. monitor node health twice, add untainted node once - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - - // 2. mark node0 healthy - node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node0...") - return - } - node0.Status = healthyNodeNewStatus - _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node0, metav1.UpdateOptions{}) - if err != nil { - t.Errorf(err.Error()) - return - } - - // add other notReady nodes - fakeNodeHandler.Existing = append(fakeNodeHandler.Existing, []*v1.Node{ - // Unreachable Taint with effect 'NoExecute' should be applied to this node. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node3", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. 
- { - ObjectMeta: metav1.ObjectMeta{ - Name: "node4", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // NotReady Taint with NoExecute effect should be applied to this node. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node5", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }...) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - // 3. start monitor node health again, add untainted node twice, construct UniqueQueue with duplicated node cache - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - - // 4. do NoExecute taint pass - // when processing with node0, condition.Status is NodeReady, and return true with default case - // then remove the set value and queue value both, the taint job never stuck - nodeController.doNoExecuteTaintingPass() - - // 5. 
get node3 and node5, see if it has ready got NoExecute taint - node3, err := fakeNodeHandler.Get(context.TODO(), "node3", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node3...") - return - } - if !taintutils.TaintExists(node3.Spec.Taints, UnreachableTaintTemplate) || len(node3.Spec.Taints) == 0 { - t.Errorf("Not found taint %v in %v, which should be present in %s", UnreachableTaintTemplate, node3.Spec.Taints, node3.Name) - } - node5, err := fakeNodeHandler.Get(context.TODO(), "node5", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node5...") - return - } - if !taintutils.TaintExists(node5.Spec.Taints, NotReadyTaintTemplate) || len(node5.Spec.Taints) == 0 { - t.Errorf("Not found taint %v in %v, which should be present in %s", NotReadyTaintTemplate, node5.Spec.Taints, node5.Name) - } -} - -func TestSwapUnreachableNotReadyTaints(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady - // we need second healthy node in tests. Because of how the tests are written we need to update - // the status of this Node. - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - timeToPass := evictionTimeout - newNodeStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - // Node status has just been updated, and is NotReady for 10min. 
- LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 9, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - healthyNodeNewStatus := v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - } - originalTaint := UnreachableTaintTemplate - updatedTaint := NotReadyTaintTemplate - - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeController.doNoExecuteTaintingPass() - - node0, err := fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node0...") - return - } - node1, err := fakeNodeHandler.Get(context.TODO(), "node1", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node1...") - return - } - - if originalTaint != nil && !taintutils.TaintExists(node0.Spec.Taints, originalTaint) { - t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) - } - - nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } - - node0.Status = newNodeStatus - node1.Status = healthyNodeNewStatus - _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node0, metav1.UpdateOptions{}) - if err != nil { - t.Errorf(err.Error()) - return - } - _, err = fakeNodeHandler.UpdateStatus(context.TODO(), node1, metav1.UpdateOptions{}) - if err != nil { - t.Errorf(err.Error()) - return - } - - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeController.doNoExecuteTaintingPass() - - node0, err = fakeNodeHandler.Get(context.TODO(), "node0", metav1.GetOptions{}) - if err != nil { - t.Errorf("Can't get current node0...") - return - } - if updatedTaint != nil { - if !taintutils.TaintExists(node0.Spec.Taints, updatedTaint) { - t.Errorf("Can't find taint %v in %v", updatedTaint, node0.Spec.Taints) - } - } -} - -func TestTaintsNodeByCondition(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - 
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - - networkUnavailableTaint := &v1.Taint{ - Key: v1.TaintNodeNetworkUnavailable, - Effect: v1.TaintEffectNoSchedule, - } - notReadyTaint := &v1.Taint{ - Key: v1.TaintNodeNotReady, - Effect: v1.TaintEffectNoSchedule, - } - unreachableTaint := &v1.Taint{ - Key: v1.TaintNodeUnreachable, - Effect: v1.TaintEffectNoSchedule, - } - - tests := []struct { - Name string - Node *v1.Node - ExpectedTaints []*v1.Taint - }{ - { - Name: "NetworkUnavailable is true", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - { - Type: v1.NodeNetworkUnavailable, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - ExpectedTaints: []*v1.Taint{networkUnavailableTaint}, - }, - { - Name: "NetworkUnavailable is true", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - { - Type: v1.NodeNetworkUnavailable, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - ExpectedTaints: []*v1.Taint{networkUnavailableTaint}, - }, - { - Name: "Ready is false", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 
0, 0, time.UTC), - }, - }, - }, - }, - ExpectedTaints: []*v1.Taint{notReadyTaint}, - }, - { - Name: "Ready is unknown", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - ExpectedTaints: []*v1.Taint{unreachableTaint}, - }, - } - - for _, test := range tests { - fakeNodeHandler.Update(context.TODO(), test.Node, metav1.UpdateOptions{}) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - nodeController.doNoScheduleTaintingPass(test.Node.Name) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - node0, err := nodeController.nodeLister.Get("node0") - if err != nil { - t.Errorf("Can't get current node0...") - return - } - if len(node0.Spec.Taints) != len(test.ExpectedTaints) { - t.Errorf("%s: Unexpected number of taints: expected %d, got %d", - test.Name, len(test.ExpectedTaints), len(node0.Spec.Taints)) - } - for _, taint := range test.ExpectedTaints { - if !taintutils.TaintExists(node0.Spec.Taints, taint) { - t.Errorf("%s: Can't find taint %v in %v", test.Name, taint, node0.Spec.Taints) - } - } - } -} - -func TestNodeEventGeneration(t *testing.T) { - fakeNow := metav1.Date(2016, 9, 10, 12, 0, 0, 0, time.UTC) - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - UID: "1234567890", - CreationTimestamp: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - 5*time.Minute, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - false) - nodeController.now = func() metav1.Time { return fakeNow } - fakeRecorder := testutil.NewFakeRecorder() - nodeController.recorder = fakeRecorder - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Errorf("unexpected error: %v", err) - } - if err := nodeController.monitorNodeHealth(); err != nil { - t.Errorf("unexpected error: %v", err) - } - if len(fakeRecorder.Events) != 1 { - t.Fatalf("unexpected events, got %v, expected %v: %+v", len(fakeRecorder.Events), 1, fakeRecorder.Events) - } - if fakeRecorder.Events[0].Reason != "RegisteredNode" { - var reasons []string - for _, event := range fakeRecorder.Events { - reasons = append(reasons, event.Reason) - } - t.Fatalf("unexpected events generation: %v", 
strings.Join(reasons, ",")) - } - for _, event := range fakeRecorder.Events { - involvedObject := event.InvolvedObject - actualUID := string(involvedObject.UID) - if actualUID != "1234567890" { - t.Fatalf("unexpected event uid: %v", actualUID) - } - } -} - -func TestReconcileNodeLabels(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - v1.LabelTopologyZone: "zone1", - v1.LabelFailureDomainBetaRegion: "region1", - v1.LabelFailureDomainBetaZone: "zone1", - }, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - - tests := []struct { - Name string - Node *v1.Node - ExpectedLabels map[string]string - }{ - { - Name: "No-op if node has no labels", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - }, - }, - ExpectedLabels: nil, - }, - { - Name: "No-op if no target labels present", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - v1.LabelTopologyRegion: "region1", - }, - }, - }, - ExpectedLabels: map[string]string{ - v1.LabelTopologyRegion: "region1", - }, - }, - { - // OpenYurt Authors: - // Currently in node_lifecycle_controller.go, we have - // - primaryKey: "beta.kubernetes.io" - // - secondaryKey: "kubernetes.io" - // But in node_lifecycle_controller_test.go, which is from k8s 1.20.11, - // assuming that - // - primaryKey: "kubernetes.io" - // - secondaryKey: "beta.kubernetes.io" - // So we change this case. - Name: "Create OS/arch stable labels when they don't exist", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - LabelOS: "linux", - LabelArch: "amd64", - }, - }, - }, - ExpectedLabels: map[string]string{ - LabelOS: "linux", - LabelArch: "amd64", - v1.LabelOSStable: "linux", - v1.LabelArchStable: "amd64", - }, - }, - { - // OpenYurt Authors: - // Currently in node_lifecycle_controller.go, we have - // - primaryKey: "beta.kubernetes.io" - // - secondaryKey: "kubernetes.io" - // But in node_lifecycle_controller_test.go, which is from k8s 1.20.11, - // assuming that - // - primaryKey: "kubernetes.io" - // - secondaryKey: "beta.kubernetes.io" - // So we change this case. 
- Name: "Reconcile OS/arch stable labels to match beta labels", - Node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), - Labels: map[string]string{ - LabelOS: "linux", - LabelArch: "amd64", - v1.LabelOSStable: "windows", - v1.LabelArchStable: "arm", - }, - }, - }, - ExpectedLabels: map[string]string{ - LabelOS: "linux", - LabelArch: "amd64", - v1.LabelOSStable: "linux", - v1.LabelArchStable: "amd64", - }, - }, - } - - for _, test := range tests { - fakeNodeHandler.Update(context.TODO(), test.Node, metav1.UpdateOptions{}) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Fatalf("unexpected error: %v", err) - } - nodeController.reconcileNodeLabels(test.Node.Name) - if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { - t.Fatalf("unexpected error: %v", err) - } - node0, err := nodeController.nodeLister.Get("node0") - if err != nil { - t.Fatalf("Can't get current node0...") - } - if len(node0.Labels) != len(test.ExpectedLabels) { - t.Errorf("%s: Unexpected number of taints: expected %d, got %d", - test.Name, len(test.ExpectedLabels), len(node0.Labels)) - } - for key, expectedValue := range test.ExpectedLabels { - actualValue, ok := node0.Labels[key] - if !ok { - t.Errorf("%s: Can't find label %v in %v", test.Name, key, node0.Labels) - } - if actualValue != expectedValue { - t.Errorf("%s: label %q: expected value %q, got value %q", test.Name, key, expectedValue, actualValue) - } - } - } -} - -func TestTryUpdateNodeHealth(t *testing.T) { - fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) - fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) - evictionTimeout := 10 * time.Minute - - fakeNodeHandler := &testutil.FakeNodeHandler{ - Existing: []*v1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - }, - }, - }, - Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), - } - - nodeController, _ := newNodeLifecycleControllerFromClient( - fakeNodeHandler, - evictionTimeout, - testRateLimiterQPS, - testRateLimiterQPS, - testLargeClusterThreshold, - testUnhealthyThreshold, - testNodeMonitorGracePeriod, - testNodeStartupGracePeriod, - testNodeMonitorPeriod, - true) - nodeController.now = func() metav1.Time { return fakeNow } - nodeController.recorder = testutil.NewFakeRecorder() - nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) - - getStatus := func(cond *v1.NodeCondition) *v1.ConditionStatus { - if cond == nil { - return nil - } - return &cond.Status - } - - tests := []struct { - name string - node *v1.Node - }{ - { - name: "Status true", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - }, - }, - }, - { - name: "Status false", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - }, - }, 
- }, - { - name: "Status unknown", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: fakeNow, - LastTransitionTime: fakeNow, - }, - }, - }, - }, - }, - { - name: "Status nil", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeNow, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{}, - }, - }, - }, - { - name: "Status true - after grace period", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeOld, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: fakeOld, - LastTransitionTime: fakeOld, - }, - }, - }, - }, - }, - { - name: "Status false - after grace period", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeOld, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionFalse, - LastHeartbeatTime: fakeOld, - LastTransitionTime: fakeOld, - }, - }, - }, - }, - }, - { - name: "Status unknown - after grace period", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeOld, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: fakeOld, - LastTransitionTime: fakeOld, - }, - }, - }, - }, - }, - { - name: "Status nil - after grace period", - node: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node0", - CreationTimestamp: fakeOld, - }, - Status: v1.NodeStatus{ - Conditions: []v1.NodeCondition{}, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - nodeController.nodeHealthMap.set(test.node.Name, &nodeHealthData{ - status: &test.node.Status, - probeTimestamp: test.node.CreationTimestamp, - readyTransitionTimestamp: test.node.CreationTimestamp, - }) - _, _, currentReadyCondition, err := nodeController.tryUpdateNodeHealth(test.node) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - _, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap.getDeepCopy(test.node.Name).status, v1.NodeReady) - savedStatus := getStatus(savedReadyCondition) - currentStatus := getStatus(currentReadyCondition) - if !apiequality.Semantic.DeepEqual(currentStatus, savedStatus) { - t.Errorf("expected %v, got %v", savedStatus, currentStatus) - } - }) - } -} diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go deleted file mode 100644 index ea9a269fda7..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go +++ /dev/null @@ -1,308 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package scheduler - -import ( - "container/heap" - "sync" - "time" - - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/util/flowcontrol" - "k8s.io/klog/v2" -) - -const ( - // NodeHealthUpdateRetry controls the number of retries of writing - // node health update. - NodeHealthUpdateRetry = 5 - // NodeEvictionPeriod controls how often NodeController will try to - // evict Pods from non-responsive Nodes. - NodeEvictionPeriod = 100 * time.Millisecond - // EvictionRateLimiterBurst is the burst value for all eviction rate - // limiters - EvictionRateLimiterBurst = 1 -) - -// TimedValue is a value that should be processed at a designated time. -type TimedValue struct { - Value string - // UID could be anything that helps identify the value - UID interface{} - AddedAt time.Time - ProcessAt time.Time -} - -// now is used to test time -var now = time.Now - -// TimedQueue is a priority heap where the lowest ProcessAt is at the front of the queue -type TimedQueue []*TimedValue - -// Len is the length of the queue. -func (h TimedQueue) Len() int { return len(h) } - -// Less returns true if queue[i] < queue[j]. -func (h TimedQueue) Less(i, j int) bool { return h[i].ProcessAt.Before(h[j].ProcessAt) } - -// Swap swaps index i and j. -func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] } - -// Push a new TimedValue on to the queue. -func (h *TimedQueue) Push(x interface{}) { - *h = append(*h, x.(*TimedValue)) -} - -// Pop the lowest ProcessAt item. -func (h *TimedQueue) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -// UniqueQueue is a FIFO queue which additionally guarantees that any -// element can be added only once until it is removed. -type UniqueQueue struct { - lock sync.Mutex - queue TimedQueue - set sets.String -} - -// Add a new value to the queue if it wasn't added before, or was -// explicitly removed by the Remove call. Returns true if new value -// was added. -func (q *UniqueQueue) Add(value TimedValue) bool { - q.lock.Lock() - defer q.lock.Unlock() - - if q.set.Has(value.Value) { - return false - } - heap.Push(&q.queue, &value) - q.set.Insert(value.Value) - return true -} - -// Replace replaces an existing value in the queue if it already -// exists, otherwise it does nothing. Returns true if the item was -// found. -func (q *UniqueQueue) Replace(value TimedValue) bool { - q.lock.Lock() - defer q.lock.Unlock() - - for i := range q.queue { - if q.queue[i].Value != value.Value { - continue - } - heap.Remove(&q.queue, i) - heap.Push(&q.queue, &value) - return true - } - return false -} - -// RemoveFromQueue the value from the queue, but keeps it in the set, -// so it won't be added second time. Returns true if something was -// removed. -func (q *UniqueQueue) RemoveFromQueue(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - if !q.set.Has(value) { - return false - } - for i, val := range q.queue { - if val.Value == value { - heap.Remove(&q.queue, i) - return true - } - } - return false -} - -// Remove the value from the queue, so Get() call won't return it, and -// allow subsequent addition of the given value. If the value is not -// present does nothing and returns false. 
-func (q *UniqueQueue) Remove(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - if !q.set.Has(value) { - return false - } - q.set.Delete(value) - for i, val := range q.queue { - if val.Value == value { - heap.Remove(&q.queue, i) - return true - } - } - return true -} - -// Get returns the oldest added value that wasn't returned yet. -func (q *UniqueQueue) Get() (TimedValue, bool) { - q.lock.Lock() - defer q.lock.Unlock() - if len(q.queue) == 0 { - return TimedValue{}, false - } - result := heap.Pop(&q.queue).(*TimedValue) - q.set.Delete(result.Value) - return *result, true -} - -// Head returns the oldest added value that wasn't returned yet -// without removing it. -func (q *UniqueQueue) Head() (TimedValue, bool) { - q.lock.Lock() - defer q.lock.Unlock() - if len(q.queue) == 0 { - return TimedValue{}, false - } - result := q.queue[0] - return *result, true -} - -// Clear removes all items from the queue and duplication preventing -// set. -func (q *UniqueQueue) Clear() { - q.lock.Lock() - defer q.lock.Unlock() - if q.queue.Len() > 0 { - q.queue = make(TimedQueue, 0) - } - if len(q.set) > 0 { - q.set = sets.NewString() - } -} - -// RateLimitedTimedQueue is a unique item priority queue ordered by -// the expected next time of execution. It is also rate limited. -type RateLimitedTimedQueue struct { - queue UniqueQueue - limiterLock sync.Mutex - limiter flowcontrol.RateLimiter -} - -// NewRateLimitedTimedQueue creates new queue which will use given -// RateLimiter to oversee execution. -func NewRateLimitedTimedQueue(limiter flowcontrol.RateLimiter) *RateLimitedTimedQueue { - return &RateLimitedTimedQueue{ - queue: UniqueQueue{ - queue: TimedQueue{}, - set: sets.NewString(), - }, - limiter: limiter, - } -} - -// ActionFunc takes a timed value and returns false if the item must -// be retried, with an optional time.Duration if some minimum wait -// interval should be used. -type ActionFunc func(TimedValue) (bool, time.Duration) - -// Try processes the queue.Ends prematurely if RateLimiter forbids an -// action and leak is true. Otherwise, requeues the item to be -// processed. Each value is processed once if fn returns true, -// otherwise it is added back to the queue. The returned remaining is -// used to identify the minimum time to execute the next item in the -// queue. The same value is processed only once unless Remove is -// explicitly called on it (it's done by the cancelPodEviction -// function in NodeController when Node becomes Ready again) TODO: -// figure out a good way to do garbage collection for all Nodes that -// were removed from the cluster. -func (q *RateLimitedTimedQueue) Try(fn ActionFunc) { - val, ok := q.queue.Head() - q.limiterLock.Lock() - defer q.limiterLock.Unlock() - for ok { - // rate limit the queue checking - if !q.limiter.TryAccept() { - klog.V(10).Infof("Try rate limited for value: %v", val) - // Try again later - break - } - - now := now() - if now.Before(val.ProcessAt) { - break - } - - if ok, wait := fn(val); !ok { - val.ProcessAt = now.Add(wait + 1) - q.queue.Replace(val) - } else { - q.queue.RemoveFromQueue(val.Value) - } - val, ok = q.queue.Head() - } -} - -// Add value to the queue to be processed. Won't add the same -// value(comparison by value) a second time if it was already added -// and not removed. -func (q *RateLimitedTimedQueue) Add(value string, uid interface{}) bool { - now := now() - return q.queue.Add(TimedValue{ - Value: value, - UID: uid, - AddedAt: now, - ProcessAt: now, - }) -} - -// Remove Node from the Evictor. 
The Node won't be processed until -// added again. -func (q *RateLimitedTimedQueue) Remove(value string) bool { - return q.queue.Remove(value) -} - -// Clear removes all items from the queue -func (q *RateLimitedTimedQueue) Clear() { - q.queue.Clear() -} - -// SwapLimiter safely swaps current limiter for this queue with the -// passed one if capacities or qps's differ. -func (q *RateLimitedTimedQueue) SwapLimiter(newQPS float32) { - q.limiterLock.Lock() - defer q.limiterLock.Unlock() - if q.limiter.QPS() == newQPS { - return - } - var newLimiter flowcontrol.RateLimiter - if newQPS <= 0 { - newLimiter = flowcontrol.NewFakeNeverRateLimiter() - } else { - newLimiter = flowcontrol.NewTokenBucketRateLimiter(newQPS, EvictionRateLimiterBurst) - - // If we're currently waiting on limiter, we drain the new one - this is a good approach when Burst value is 1 - // TODO: figure out if we need to support higher Burst values and decide on the drain logic, should we keep: - // - saturation (percentage of used tokens) - // - number of used tokens - // - number of available tokens - // - something else - if q.limiter.TryAccept() == false { - newLimiter.TryAccept() - } - } - q.limiter.Stop() - q.limiter = newLimiter -} diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go deleted file mode 100644 index 644b6569039..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go +++ /dev/null @@ -1,334 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduler - -import ( - "reflect" - "testing" - "time" - - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/client-go/util/flowcontrol" -) - -func CheckQueueEq(lhs []string, rhs TimedQueue) bool { - for i := 0; i < len(lhs); i++ { - if rhs[i].Value != lhs[i] { - return false - } - } - return true -} - -func CheckSetEq(lhs, rhs sets.String) bool { - return lhs.HasAll(rhs.List()...) && rhs.HasAll(lhs.List()...) -} - -func TestAddNode(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - - queuePattern := []string{"first", "second", "third"} - if len(evictor.queue.queue) != len(queuePattern) { - t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) - } - if !CheckQueueEq(queuePattern, evictor.queue.queue) { - t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) - } - - setPattern := sets.NewString("first", "second", "third") - if len(evictor.queue.set) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, evictor.queue.set) { - t.Errorf("Invalid map. 
Got %v, expected %v", evictor.queue.set, setPattern) - } -} - -func TestDelNode(t *testing.T) { - defer func() { now = time.Now }() - var tick int64 - now = func() time.Time { - t := time.Unix(tick, 0) - tick++ - return t - } - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - evictor.Remove("first") - - queuePattern := []string{"second", "third"} - if len(evictor.queue.queue) != len(queuePattern) { - t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) - } - if !CheckQueueEq(queuePattern, evictor.queue.queue) { - t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) - } - - setPattern := sets.NewString("second", "third") - if len(evictor.queue.set) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, evictor.queue.set) { - t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) - } - - evictor = NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - evictor.Remove("second") - - queuePattern = []string{"first", "third"} - if len(evictor.queue.queue) != len(queuePattern) { - t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) - } - if !CheckQueueEq(queuePattern, evictor.queue.queue) { - t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) - } - - setPattern = sets.NewString("first", "third") - if len(evictor.queue.set) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, evictor.queue.set) { - t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) - } - - evictor = NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - evictor.Remove("third") - - queuePattern = []string{"first", "second"} - if len(evictor.queue.queue) != len(queuePattern) { - t.Fatalf("Queue %v should have length %d", evictor.queue.queue, len(queuePattern)) - } - if !CheckQueueEq(queuePattern, evictor.queue.queue) { - t.Errorf("Invalid queue. Got %v, expected %v", evictor.queue.queue, queuePattern) - } - - setPattern = sets.NewString("first", "second") - if len(evictor.queue.set) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, evictor.queue.set) { - t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) - } -} - -func TestTry(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - evictor.Remove("second") - - deletedMap := sets.NewString() - evictor.Try(func(value TimedValue) (bool, time.Duration) { - deletedMap.Insert(value.Value) - return true, 0 - }) - - setPattern := sets.NewString("first", "third") - if len(deletedMap) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, deletedMap) { - t.Errorf("Invalid map. 
Got %v, expected %v", deletedMap, setPattern) - } -} - -func TestTryOrdering(t *testing.T) { - defer func() { now = time.Now }() - current := time.Unix(0, 0) - delay := 0 - // the current time is incremented by 1ms every time now is invoked - now = func() time.Time { - if delay > 0 { - delay-- - } else { - current = current.Add(time.Millisecond) - } - t.Logf("time %d", current.UnixNano()) - return current - } - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - - order := []string{} - count := 0 - hasQueued := false - evictor.Try(func(value TimedValue) (bool, time.Duration) { - count++ - t.Logf("eviction %d", count) - if value.ProcessAt.IsZero() { - t.Fatalf("processAt should not be zero") - } - switch value.Value { - case "first": - if !value.AddedAt.Equal(time.Unix(0, time.Millisecond.Nanoseconds())) { - t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) - } - - case "second": - if !value.AddedAt.Equal(time.Unix(0, 2*time.Millisecond.Nanoseconds())) { - t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) - } - if hasQueued { - if !value.ProcessAt.Equal(time.Unix(0, 6*time.Millisecond.Nanoseconds())) { - t.Fatalf("process time for %s is %v", value.Value, value.ProcessAt) - } - break - } - hasQueued = true - delay = 1 - t.Logf("going to delay") - return false, 2 * time.Millisecond - - case "third": - if !value.AddedAt.Equal(time.Unix(0, 3*time.Millisecond.Nanoseconds())) { - t.Fatalf("added time for %s is %v", value.Value, value.AddedAt) - } - } - order = append(order, value.Value) - return true, 0 - }) - if !reflect.DeepEqual(order, []string{"first", "third"}) { - t.Fatalf("order was wrong: %v", order) - } - if count != 3 { - t.Fatalf("unexpected iterations: %d", count) - } -} - -func TestTryRemovingWhileTry(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - - processing := make(chan struct{}) - wait := make(chan struct{}) - order := []string{} - count := 0 - queued := false - - // while the Try function is processing "second", remove it from the queue - // we should not see "second" retried. 
- go func() { - <-processing - evictor.Remove("second") - close(wait) - }() - - evictor.Try(func(value TimedValue) (bool, time.Duration) { - count++ - if value.AddedAt.IsZero() { - t.Fatalf("added should not be zero") - } - if value.ProcessAt.IsZero() { - t.Fatalf("next should not be zero") - } - if !queued && value.Value == "second" { - queued = true - close(processing) - <-wait - return false, time.Millisecond - } - order = append(order, value.Value) - return true, 0 - }) - - if !reflect.DeepEqual(order, []string{"first", "third"}) { - t.Fatalf("order was wrong: %v", order) - } - if count != 3 { - t.Fatalf("unexpected iterations: %d", count) - } -} - -func TestClear(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - - evictor.Clear() - - if len(evictor.queue.queue) != 0 { - t.Fatalf("Clear should remove all elements from the queue.") - } -} - -func TestSwapLimiter(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - fakeAlways := flowcontrol.NewFakeAlwaysRateLimiter() - qps := evictor.limiter.QPS() - if qps != fakeAlways.QPS() { - t.Fatalf("QPS does not match create one: %v instead of %v", qps, fakeAlways.QPS()) - } - - evictor.SwapLimiter(0) - qps = evictor.limiter.QPS() - fakeNever := flowcontrol.NewFakeNeverRateLimiter() - if qps != fakeNever.QPS() { - t.Fatalf("QPS does not match create one: %v instead of %v", qps, fakeNever.QPS()) - } - - createdQPS := float32(5.5) - evictor.SwapLimiter(createdQPS) - qps = evictor.limiter.QPS() - if qps != createdQPS { - t.Fatalf("QPS does not match create one: %v instead of %v", qps, createdQPS) - } -} - -func TestAddAfterTry(t *testing.T) { - evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter()) - evictor.Add("first", "11111") - evictor.Add("second", "22222") - evictor.Add("third", "33333") - evictor.Remove("second") - - deletedMap := sets.NewString() - evictor.Try(func(value TimedValue) (bool, time.Duration) { - deletedMap.Insert(value.Value) - return true, 0 - }) - - setPattern := sets.NewString("first", "third") - if len(deletedMap) != len(setPattern) { - t.Fatalf("Map %v should have length %d", evictor.queue.set, len(setPattern)) - } - if !CheckSetEq(setPattern, deletedMap) { - t.Errorf("Invalid map. Got %v, expected %v", deletedMap, setPattern) - } - - evictor.Add("first", "11111") - evictor.Try(func(value TimedValue) (bool, time.Duration) { - t.Errorf("We shouldn't process the same value if the explicit remove wasn't called.") - return true, 0 - }) -} diff --git a/pkg/controller/nodelifecycle/scheduler/taint_manager.go b/pkg/controller/nodelifecycle/scheduler/taint_manager.go deleted file mode 100644 index 5de3cf6eceb..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/taint_manager.go +++ /dev/null @@ -1,540 +0,0 @@ -/* -Copyright 2020 The OpenYurt Authors. -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package scheduler - -import ( - "context" - "fmt" - "hash/fnv" - "io" - "math" - "sync" - "time" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientset "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - v1core "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/workqueue" - "k8s.io/klog/v2" - - nodeutil "github.com/openyurtio/openyurt/pkg/controller/util/node" -) - -const ( - // TODO (k82cn): Figure out a reasonable number of workers/channels and propagate - // the number of workers up making it a parameter of Run() function. - - // NodeUpdateChannelSize defines the size of channel for node update events. - NodeUpdateChannelSize = 10 - // UpdateWorkerSize defines the size of workers for node update or/and pod update. - UpdateWorkerSize = 8 - podUpdateChannelSize = 1 - retries = 5 -) - -type nodeUpdateItem struct { - nodeName string -} - -type podUpdateItem struct { - podName string - podNamespace string - nodeName string -} - -func hash(val string, max int) int { - hasher := fnv.New32a() - io.WriteString(hasher, val) - return int(hasher.Sum32() % uint32(max)) -} - -// GetPodFunc returns the pod for the specified name/namespace, or a NotFound error if missing. -type GetPodFunc func(name, namespace string) (*v1.Pod, error) - -// GetNodeFunc returns the node for the specified name, or a NotFound error if missing. -type GetNodeFunc func(name string) (*v1.Node, error) - -// GetPodsByNodeNameFunc returns the list of pods assigned to the specified node. -type GetPodsByNodeNameFunc func(nodeName string) ([]*v1.Pod, error) - -// NoExecuteTaintManager listens to Taint/Toleration changes and is responsible for removing Pods -// from Nodes tainted with NoExecute Taints. 
-type NoExecuteTaintManager struct { - client clientset.Interface - recorder record.EventRecorder - getPod GetPodFunc - getNode GetNodeFunc - getPodsAssignedToNode GetPodsByNodeNameFunc - - taintEvictionQueue *TimedWorkerQueue - // keeps a map from nodeName to all noExecute taints on that Node - taintedNodesLock sync.Mutex - taintedNodes map[string][]v1.Taint - - nodeUpdateChannels []chan nodeUpdateItem - podUpdateChannels []chan podUpdateItem - - nodeUpdateQueue workqueue.Interface - podUpdateQueue workqueue.Interface -} - -func deletePodHandler(c clientset.Interface, emitEventFunc func(types.NamespacedName)) func(args *WorkArgs) error { - return func(args *WorkArgs) error { - ns := args.NamespacedName.Namespace - name := args.NamespacedName.Name - klog.V(0).Infof("NoExecuteTaintManager is deleting Pod: %v", args.NamespacedName.String()) - if emitEventFunc != nil { - emitEventFunc(args.NamespacedName) - } - var err error - for i := 0; i < retries; i++ { - err = c.CoreV1().Pods(ns).Delete(context.TODO(), name, metav1.DeleteOptions{}) - if err == nil { - break - } - time.Sleep(10 * time.Millisecond) - } - if err != nil { - klog.Errorf("NoExecuteTaintManager delete pod %v with error: %v", args.NamespacedName.String(), err) - } - return err - } -} - -func getNoExecuteTaints(taints []v1.Taint) []v1.Taint { - result := []v1.Taint{} - for i := range taints { - if taints[i].Effect == v1.TaintEffectNoExecute { - result = append(result, taints[i]) - } - } - return result -} - -// getMinTolerationTime returns minimal toleration time from the given slice, or -1 if it's infinite. -func getMinTolerationTime(tolerations []v1.Toleration) time.Duration { - minTolerationTime := int64(math.MaxInt64) - if len(tolerations) == 0 { - return 0 - } - - for i := range tolerations { - if tolerations[i].TolerationSeconds != nil { - tolerationSeconds := *(tolerations[i].TolerationSeconds) - if tolerationSeconds <= 0 { - return 0 - } else if tolerationSeconds < minTolerationTime { - minTolerationTime = tolerationSeconds - } - } - } - - if minTolerationTime == int64(math.MaxInt64) { - return -1 - } - return time.Duration(minTolerationTime) * time.Second -} - -// NewNoExecuteTaintManager creates a new NoExecuteTaintManager that will use passed clientset to -// communicate with the API server. -func NewNoExecuteTaintManager(c clientset.Interface, getPod GetPodFunc, getNode GetNodeFunc, getPodsAssignedToNode GetPodsByNodeNameFunc) *NoExecuteTaintManager { - eventBroadcaster := record.NewBroadcaster() - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "taint-controller"}) - eventBroadcaster.StartLogging(klog.Infof) - if c != nil { - klog.V(0).Infof("Sending events to api server.") - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: c.CoreV1().Events("")}) - } else { - klog.Fatalf("kubeClient is nil when starting NodeController") - } - - tm := &NoExecuteTaintManager{ - client: c, - recorder: recorder, - getPod: getPod, - getNode: getNode, - getPodsAssignedToNode: getPodsAssignedToNode, - taintedNodes: make(map[string][]v1.Taint), - - nodeUpdateQueue: workqueue.NewNamed("noexec_taint_node"), - podUpdateQueue: workqueue.NewNamed("noexec_taint_pod"), - } - tm.taintEvictionQueue = CreateWorkerQueue(deletePodHandler(c, tm.emitPodDeletionEvent)) - - return tm -} - -// Run starts NoExecuteTaintManager which will run in loop until `stopCh` is closed. 
-func (tc *NoExecuteTaintManager) Run(stopCh <-chan struct{}) { - klog.V(0).Infof("Starting NoExecuteTaintManager") - - for i := 0; i < UpdateWorkerSize; i++ { - tc.nodeUpdateChannels = append(tc.nodeUpdateChannels, make(chan nodeUpdateItem, NodeUpdateChannelSize)) - tc.podUpdateChannels = append(tc.podUpdateChannels, make(chan podUpdateItem, podUpdateChannelSize)) - } - - // Functions that are responsible for taking work items out of the workqueues and putting them - // into channels. - go func(stopCh <-chan struct{}) { - for { - item, shutdown := tc.nodeUpdateQueue.Get() - if shutdown { - break - } - nodeUpdate := item.(nodeUpdateItem) - hash := hash(nodeUpdate.nodeName, UpdateWorkerSize) - select { - case <-stopCh: - tc.nodeUpdateQueue.Done(item) - return - case tc.nodeUpdateChannels[hash] <- nodeUpdate: - // tc.nodeUpdateQueue.Done is called by the nodeUpdateChannels worker - } - } - }(stopCh) - - go func(stopCh <-chan struct{}) { - for { - item, shutdown := tc.podUpdateQueue.Get() - if shutdown { - break - } - // The fact that pods are processed by the same worker as nodes is used to avoid races - // between node worker setting tc.taintedNodes and pod worker reading this to decide - // whether to delete pod. - // It's possible that even without this assumption this code is still correct. - podUpdate := item.(podUpdateItem) - hash := hash(podUpdate.nodeName, UpdateWorkerSize) - select { - case <-stopCh: - tc.podUpdateQueue.Done(item) - return - case tc.podUpdateChannels[hash] <- podUpdate: - // tc.podUpdateQueue.Done is called by the podUpdateChannels worker - } - } - }(stopCh) - - wg := sync.WaitGroup{} - wg.Add(UpdateWorkerSize) - for i := 0; i < UpdateWorkerSize; i++ { - go tc.worker(i, wg.Done, stopCh) - } - wg.Wait() -} - -func (tc *NoExecuteTaintManager) worker(worker int, done func(), stopCh <-chan struct{}) { - defer done() - - // When processing events we want to prioritize Node updates over Pod updates, - // as NodeUpdates that interest NoExecuteTaintManager should be handled as soon as possible - - // we don't want user (or system) to wait until PodUpdate queue is drained before it can - // start evicting Pods from tainted Nodes. - for { - select { - case <-stopCh: - return - case nodeUpdate := <-tc.nodeUpdateChannels[worker]: - tc.handleNodeUpdate(nodeUpdate) - tc.nodeUpdateQueue.Done(nodeUpdate) - case podUpdate := <-tc.podUpdateChannels[worker]: - // If we found a Pod update we need to empty Node queue first. - priority: - for { - select { - case nodeUpdate := <-tc.nodeUpdateChannels[worker]: - tc.handleNodeUpdate(nodeUpdate) - tc.nodeUpdateQueue.Done(nodeUpdate) - default: - break priority - } - } - // After Node queue is emptied we process podUpdate. - tc.handlePodUpdate(podUpdate) - tc.podUpdateQueue.Done(podUpdate) - } - } -} - -// PodUpdated is used to notify NoExecuteTaintManager about Pod changes. 
-func (tc *NoExecuteTaintManager) PodUpdated(oldPod *v1.Pod, newPod *v1.Pod) { - podName := "" - podNamespace := "" - nodeName := "" - oldTolerations := []v1.Toleration{} - if oldPod != nil { - podName = oldPod.Name - podNamespace = oldPod.Namespace - nodeName = oldPod.Spec.NodeName - oldTolerations = oldPod.Spec.Tolerations - } - newTolerations := []v1.Toleration{} - if newPod != nil { - podName = newPod.Name - podNamespace = newPod.Namespace - nodeName = newPod.Spec.NodeName - newTolerations = newPod.Spec.Tolerations - } - - if oldPod != nil && newPod != nil && equality.Semantic.DeepEqual(oldTolerations, newTolerations) && oldPod.Spec.NodeName == newPod.Spec.NodeName { - return - } - updateItem := podUpdateItem{ - podName: podName, - podNamespace: podNamespace, - nodeName: nodeName, - } - - tc.podUpdateQueue.Add(updateItem) -} - -// NodeUpdated is used to notify NoExecuteTaintManager about Node changes. -func (tc *NoExecuteTaintManager) NodeUpdated(oldNode *v1.Node, newNode *v1.Node) { - nodeName := "" - oldTaints := []v1.Taint{} - if oldNode != nil { - nodeName = oldNode.Name - oldTaints = getNoExecuteTaints(oldNode.Spec.Taints) - } - - newTaints := []v1.Taint{} - if newNode != nil { - nodeName = newNode.Name - newTaints = getNoExecuteTaints(newNode.Spec.Taints) - } - - if oldNode != nil && newNode != nil && equality.Semantic.DeepEqual(oldTaints, newTaints) { - return - } - - // if node in autonomy status, skip evict pods from the node - if newNode != nil { - _, readyCondition := nodeutil.GetNodeCondition(&newNode.Status, v1.NodeReady) - if readyCondition != nil && readyCondition.Status != v1.ConditionTrue { // node is not ready - if newNode.Annotations != nil && newNode.Annotations[nodeutil.AnnotationKeyNodeAutonomy] == "true" { - klog.V(2).Infof("node %s is in autonomy status, so skip pods eviction in no execute taint manager", newNode.Name) - tc.taintedNodesLock.Lock() - defer tc.taintedNodesLock.Unlock() - delete(tc.taintedNodes, newNode.Name) - return - } - } - } - - updateItem := nodeUpdateItem{ - nodeName: nodeName, - } - - tc.nodeUpdateQueue.Add(updateItem) -} - -func (tc *NoExecuteTaintManager) cancelWorkWithEvent(nsName types.NamespacedName) { - if tc.taintEvictionQueue.CancelWork(nsName.String()) { - tc.emitCancelPodDeletionEvent(nsName) - } -} - -func (tc *NoExecuteTaintManager) processPodOnNode( - podNamespacedName types.NamespacedName, - nodeName string, - tolerations []v1.Toleration, - taints []v1.Taint, - now time.Time, -) { - if len(taints) == 0 { - tc.cancelWorkWithEvent(podNamespacedName) - } - allTolerated, usedTolerations := GetMatchingTolerations(taints, tolerations) - if !allTolerated { - klog.V(2).Infof("Not all taints are tolerated after update for Pod %v on %v", podNamespacedName.String(), nodeName) - // We're canceling scheduled work (if any), as we're going to delete the Pod right away. - tc.cancelWorkWithEvent(podNamespacedName) - tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), time.Now(), time.Now()) - return - } - minTolerationTime := getMinTolerationTime(usedTolerations) - // getMinTolerationTime returns negative value to denote infinite toleration. - if minTolerationTime < 0 { - klog.V(4).Infof("New tolerations for %v tolerate forever. 
Scheduled deletion won't be cancelled if already scheduled.", podNamespacedName.String()) - return - } - - startTime := now - triggerTime := startTime.Add(minTolerationTime) - scheduledEviction := tc.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) - if scheduledEviction != nil { - startTime = scheduledEviction.CreatedAt - if startTime.Add(minTolerationTime).Before(triggerTime) { - return - } - tc.cancelWorkWithEvent(podNamespacedName) - } - tc.taintEvictionQueue.AddWork(NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime) -} - -func (tc *NoExecuteTaintManager) handlePodUpdate(podUpdate podUpdateItem) { - pod, err := tc.getPod(podUpdate.podName, podUpdate.podNamespace) - if err != nil { - if apierrors.IsNotFound(err) { - // Delete - podNamespacedName := types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName} - klog.V(4).Infof("Noticed pod deletion: %#v", podNamespacedName) - tc.cancelWorkWithEvent(podNamespacedName) - return - } - utilruntime.HandleError(fmt.Errorf("could not get pod %s/%s: %w", podUpdate.podName, podUpdate.podNamespace, err)) - return - } - - // We key the workqueue and shard workers by nodeName. If we don't match the current state we should not be the one processing the current object. - if pod.Spec.NodeName != podUpdate.nodeName { - return - } - - // Create or Update - podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - klog.V(4).Infof("Noticed pod update: %#v", podNamespacedName) - nodeName := pod.Spec.NodeName - if nodeName == "" { - return - } - taints, ok := func() ([]v1.Taint, bool) { - tc.taintedNodesLock.Lock() - defer tc.taintedNodesLock.Unlock() - taints, ok := tc.taintedNodes[nodeName] - return taints, ok - }() - // It's possible that Node was deleted, or Taints were removed before, which triggered - // eviction cancelling if it was needed. - if !ok { - return - } - tc.processPodOnNode(podNamespacedName, nodeName, pod.Spec.Tolerations, taints, time.Now()) -} - -func (tc *NoExecuteTaintManager) handleNodeUpdate(nodeUpdate nodeUpdateItem) { - node, err := tc.getNode(nodeUpdate.nodeName) - if err != nil { - if apierrors.IsNotFound(err) { - // Delete - klog.V(4).Infof("Noticed node deletion: %#v", nodeUpdate.nodeName) - tc.taintedNodesLock.Lock() - defer tc.taintedNodesLock.Unlock() - delete(tc.taintedNodes, nodeUpdate.nodeName) - return - } - utilruntime.HandleError(fmt.Errorf("cannot get node %s: %w", nodeUpdate.nodeName, err)) - return - } - - // Create or Update - klog.V(4).Infof("Noticed node update: %#v", nodeUpdate) - taints := getNoExecuteTaints(node.Spec.Taints) - func() { - tc.taintedNodesLock.Lock() - defer tc.taintedNodesLock.Unlock() - klog.V(4).Infof("Updating known taints on node %v: %v", node.Name, taints) - if len(taints) == 0 { - delete(tc.taintedNodes, node.Name) - } else { - tc.taintedNodes[node.Name] = taints - } - }() - - // This is critical that we update tc.taintedNodes before we call getPodsAssignedToNode: - // getPodsAssignedToNode can be delayed as long as all future updates to pods will call - // tc.PodUpdated which will use tc.taintedNodes to potentially delete delayed pods. - pods, err := tc.getPodsAssignedToNode(node.Name) - if err != nil { - klog.Errorf(err.Error()) - return - } - if len(pods) == 0 { - return - } - // Short circuit, to make this controller a bit faster. - if len(taints) == 0 { - klog.V(4).Infof("All taints were removed from the Node %v. 
Cancelling all evictions...", node.Name) - for i := range pods { - tc.cancelWorkWithEvent(types.NamespacedName{Namespace: pods[i].Namespace, Name: pods[i].Name}) - } - return - } - - now := time.Now() - for _, pod := range pods { - podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - tc.processPodOnNode(podNamespacedName, node.Name, pod.Spec.Tolerations, taints, now) - } -} - -func (tc *NoExecuteTaintManager) emitPodDeletionEvent(nsName types.NamespacedName) { - if tc.recorder == nil { - return - } - ref := &v1.ObjectReference{ - Kind: "Pod", - Name: nsName.Name, - Namespace: nsName.Namespace, - } - tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Marking for deletion Pod %s", nsName.String()) -} - -func (tc *NoExecuteTaintManager) emitCancelPodDeletionEvent(nsName types.NamespacedName) { - if tc.recorder == nil { - return - } - ref := &v1.ObjectReference{ - Kind: "Pod", - Name: nsName.Name, - Namespace: nsName.Namespace, - } - tc.recorder.Eventf(ref, v1.EventTypeNormal, "TaintManagerEviction", "Cancelling deletion of Pod %s", nsName.String()) -} - -// GetMatchingTolerations Returns true and list of Tolerations matching all Taints if all are tolerated, or false otherwise. -func GetMatchingTolerations(taints []v1.Taint, tolerations []v1.Toleration) (bool, []v1.Toleration) { - if len(taints) == 0 { - return true, []v1.Toleration{} - } - if len(tolerations) == 0 && len(taints) > 0 { - return false, []v1.Toleration{} - } - result := []v1.Toleration{} - for i := range taints { - tolerated := false - for j := range tolerations { - if tolerations[j].ToleratesTaint(&taints[i]) { - result = append(result, tolerations[j]) - tolerated = true - break - } - } - if !tolerated { - return false, []v1.Toleration{} - } - } - return true, result -} diff --git a/pkg/controller/nodelifecycle/scheduler/taint_manager_test.go b/pkg/controller/nodelifecycle/scheduler/taint_manager_test.go deleted file mode 100644 index 73857cfc356..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/taint_manager_test.go +++ /dev/null @@ -1,941 +0,0 @@ -/* -Copyright 2022 The OpenYurt Authors. -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -/* -This file was derived from k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler/taint_manager_test.go -at commit: 27522a29feb. - -CHANGELOG from OpenYurt Authors: -1. Remove temporary->infinite taints case in TestUpdateNodeWithMultipleTaints. -2. Add autonomy case in TestUpdateNode. 
-*/ - -package scheduler - -import ( - "context" - "fmt" - "sort" - "sync" - "testing" - "time" - - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes/fake" - clienttesting "k8s.io/client-go/testing" - - "github.com/openyurtio/openyurt/pkg/controller/kubernetes/controller/testutil" - "github.com/openyurtio/openyurt/pkg/controller/util/node" -) - -var timeForControllerToProgress = 500 * time.Millisecond - -func getPodFromClientset(clientset *fake.Clientset) GetPodFunc { - return func(name, namespace string) (*v1.Pod, error) { - return clientset.CoreV1().Pods(namespace).Get(context.TODO(), name, metav1.GetOptions{}) - } -} - -func getPodsAssignedToNode(c *fake.Clientset) GetPodsByNodeNameFunc { - return func(nodeName string) ([]*v1.Pod, error) { - selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName}) - pods, err := c.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{ - FieldSelector: selector.String(), - LabelSelector: labels.Everything().String(), - }) - if err != nil { - return []*v1.Pod{}, fmt.Errorf("failed to get Pods assigned to node %v", nodeName) - } - rPods := make([]*v1.Pod, len(pods.Items)) - for i := range pods.Items { - rPods[i] = &pods.Items[i] - } - return rPods, nil - } -} - -func getNodeFromClientset(clientset *fake.Clientset) GetNodeFunc { - return func(name string) (*v1.Node, error) { - return clientset.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{}) - } -} - -type podHolder struct { - pod *v1.Pod - sync.Mutex -} - -func (p *podHolder) getPod(name, namespace string) (*v1.Pod, error) { - p.Lock() - defer p.Unlock() - return p.pod, nil -} -func (p *podHolder) setPod(pod *v1.Pod) { - p.Lock() - defer p.Unlock() - p.pod = pod -} - -type nodeHolder struct { - lock sync.Mutex - - node *v1.Node -} - -func (n *nodeHolder) setNode(node *v1.Node) { - n.lock.Lock() - defer n.lock.Unlock() - n.node = node -} - -func (n *nodeHolder) getNode(name string) (*v1.Node, error) { - n.lock.Lock() - defer n.lock.Unlock() - return n.node, nil -} - -func createNoExecuteTaint(index int) v1.Taint { - now := metav1.Now() - return v1.Taint{ - Key: "testTaint" + fmt.Sprintf("%v", index), - Value: "test" + fmt.Sprintf("%v", index), - Effect: v1.TaintEffectNoExecute, - TimeAdded: &now, - } -} - -func addToleration(pod *v1.Pod, index int, duration int64) *v1.Pod { - if pod.Annotations == nil { - pod.Annotations = map[string]string{} - } - if duration < 0 { - pod.Spec.Tolerations = []v1.Toleration{{Key: "testTaint" + fmt.Sprintf("%v", index), Value: "test" + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoExecute}} - - } else { - pod.Spec.Tolerations = []v1.Toleration{{Key: "testTaint" + fmt.Sprintf("%v", index), Value: "test" + fmt.Sprintf("%v", index), Effect: v1.TaintEffectNoExecute, TolerationSeconds: &duration}} - } - return pod -} - -func addTaintsToNode(node *v1.Node, key, value string, indices []int) *v1.Node { - taints := []v1.Taint{} - for _, index := range indices { - taints = append(taints, createNoExecuteTaint(index)) - } - node.Spec.Taints = taints - return node -} - -type timestampedPod struct { - names []string - timestamp time.Duration -} - -type durationSlice []timestampedPod - -func (a durationSlice) Len() int { return len(a) } -func (a durationSlice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a durationSlice) Less(i, j int) 
bool { return a[i].timestamp < a[j].timestamp } - -func TestFilterNoExecuteTaints(t *testing.T) { - taints := []v1.Taint{ - { - Key: "one", - Value: "one", - Effect: v1.TaintEffectNoExecute, - }, - { - Key: "two", - Value: "two", - Effect: v1.TaintEffectNoSchedule, - }, - } - taints = getNoExecuteTaints(taints) - if len(taints) != 1 || taints[0].Key != "one" { - t.Errorf("Filtering doesn't work. Got %v", taints) - } -} - -func TestCreatePod(t *testing.T) { - testCases := []struct { - description string - pod *v1.Pod - taintedNodes map[string][]v1.Taint - expectDelete bool - }{ - { - description: "not scheduled - ignore", - pod: testutil.NewPod("pod1", ""), - taintedNodes: map[string][]v1.Taint{}, - expectDelete: false, - }, - { - description: "scheduled on untainted Node", - pod: testutil.NewPod("pod1", "node1"), - taintedNodes: map[string][]v1.Taint{}, - expectDelete: false, - }, - { - description: "schedule on tainted Node", - pod: testutil.NewPod("pod1", "node1"), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: true, - }, - { - description: "schedule on tainted Node with finite toleration", - pod: addToleration(testutil.NewPod("pod1", "node1"), 1, 100), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: false, - }, - { - description: "schedule on tainted Node with infinite toleration", - pod: addToleration(testutil.NewPod("pod1", "node1"), 1, -1), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: false, - }, - { - description: "schedule on tainted Node with infinite ivalid toleration", - pod: addToleration(testutil.NewPod("pod1", "node1"), 2, -1), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: true, - }, - } - - for _, item := range testCases { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset() - controller := NewNoExecuteTaintManager(fakeClientset, (&podHolder{pod: item.pod}).getPod, getNodeFromClientset(fakeClientset), getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.taintedNodes = item.taintedNodes - controller.PodUpdated(nil, item.pod) - // wait a bit - time.Sleep(timeForControllerToProgress) - - podDeleted := false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if podDeleted != item.expectDelete { - t.Errorf("%v: Unexpected test result. 
Expected delete %v, got %v", item.description, item.expectDelete, podDeleted) - } - close(stopCh) - } -} - -func TestDeletePod(t *testing.T) { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset() - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), getNodeFromClientset(fakeClientset), getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.taintedNodes = map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - } - controller.PodUpdated(testutil.NewPod("pod1", "node1"), nil) - // wait a bit to see if nothing will panic - time.Sleep(timeForControllerToProgress) - close(stopCh) -} - -func TestUpdatePod(t *testing.T) { - testCases := []struct { - description string - prevPod *v1.Pod - newPod *v1.Pod - taintedNodes map[string][]v1.Taint - expectDelete bool - additionalSleep time.Duration - }{ - { - description: "scheduling onto tainted Node", - prevPod: testutil.NewPod("pod1", ""), - newPod: testutil.NewPod("pod1", "node1"), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: true, - }, - { - description: "scheduling onto tainted Node with toleration", - prevPod: addToleration(testutil.NewPod("pod1", ""), 1, -1), - newPod: addToleration(testutil.NewPod("pod1", "node1"), 1, -1), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: false, - }, - { - description: "removing toleration", - prevPod: addToleration(testutil.NewPod("pod1", "node1"), 1, 100), - newPod: testutil.NewPod("pod1", "node1"), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: true, - }, - { - description: "lengthening toleration shouldn't work", - prevPod: addToleration(testutil.NewPod("pod1", "node1"), 1, 1), - newPod: addToleration(testutil.NewPod("pod1", "node1"), 1, 100), - taintedNodes: map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - }, - expectDelete: true, - additionalSleep: 1500 * time.Millisecond, - }, - } - - for _, item := range testCases { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset() - holder := &podHolder{} - controller := NewNoExecuteTaintManager(fakeClientset, holder.getPod, getNodeFromClientset(fakeClientset), getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.taintedNodes = item.taintedNodes - - holder.setPod(item.prevPod) - controller.PodUpdated(nil, item.prevPod) - fakeClientset.ClearActions() - time.Sleep(timeForControllerToProgress) - holder.setPod(item.newPod) - controller.PodUpdated(item.prevPod, item.newPod) - // wait a bit - time.Sleep(timeForControllerToProgress) - if item.additionalSleep > 0 { - time.Sleep(item.additionalSleep) - } - - podDeleted := false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if podDeleted != item.expectDelete { - t.Errorf("%v: Unexpected test result. 
Expected delete %v, got %v", item.description, item.expectDelete, podDeleted) - } - close(stopCh) - } -} - -func TestCreateNode(t *testing.T) { - testCases := []struct { - description string - pods []v1.Pod - node *v1.Node - expectDelete bool - }{ - { - description: "Creating Node matching already assigned Pod", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - node: testutil.NewNode("node1"), - expectDelete: false, - }, - { - description: "Creating tainted Node matching already assigned Pod", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - node: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: true, - }, - { - description: "Creating tainted Node matching already assigned tolerating Pod", - pods: []v1.Pod{ - *addToleration(testutil.NewPod("pod1", "node1"), 1, -1), - }, - node: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: false, - }, - } - - for _, item := range testCases { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), (&nodeHolder{node: item.node}).getNode, getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.NodeUpdated(nil, item.node) - // wait a bit - time.Sleep(timeForControllerToProgress) - - podDeleted := false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if podDeleted != item.expectDelete { - t.Errorf("%v: Unexpected test result. Expected delete %v, got %v", item.description, item.expectDelete, podDeleted) - } - close(stopCh) - } -} - -func TestDeleteNode(t *testing.T) { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset() - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), getNodeFromClientset(fakeClientset), getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - controller.taintedNodes = map[string][]v1.Taint{ - "node1": {createNoExecuteTaint(1)}, - } - go controller.Run(stopCh) - controller.NodeUpdated(testutil.NewNode("node1"), nil) - // wait a bit to see if nothing will panic - time.Sleep(timeForControllerToProgress) - controller.taintedNodesLock.Lock() - if _, ok := controller.taintedNodes["node1"]; ok { - t.Error("Node should have been deleted from taintedNodes list") - } - controller.taintedNodesLock.Unlock() - close(stopCh) -} - -func TestUpdateNode(t *testing.T) { - testCases := []struct { - description string - pods []v1.Pod - oldNode *v1.Node - newNode *v1.Node - expectDelete bool - additionalSleep time.Duration - }{ - { - description: "Added taint", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: true, - }, - { - description: "Added tolerated taint", - pods: []v1.Pod{ - *addToleration(testutil.NewPod("pod1", "node1"), 1, 100), - }, - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: false, - }, - { - description: "Only one added taint tolerated", - pods: []v1.Pod{ - *addToleration(testutil.NewPod("pod1", "node1"), 1, 100), - }, - oldNode: testutil.NewNode("node1"), - newNode: 
addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), - expectDelete: true, - }, - { - description: "Taint removed", - pods: []v1.Pod{ - *addToleration(testutil.NewPod("pod1", "node1"), 1, 1), - }, - oldNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - newNode: testutil.NewNode("node1"), - expectDelete: false, - additionalSleep: 1500 * time.Millisecond, - }, - { - description: "Pod with multiple tolerations are evicted when first one runs out", - pods: []v1.Pod{ - { - ObjectMeta: metav1.ObjectMeta{ - Namespace: "default", - Name: "pod1", - }, - Spec: v1.PodSpec{ - NodeName: "node1", - Tolerations: []v1.Toleration{ - {Key: "testTaint1", Value: "test1", Effect: v1.TaintEffectNoExecute, TolerationSeconds: &[]int64{1}[0]}, - {Key: "testTaint2", Value: "test2", Effect: v1.TaintEffectNoExecute, TolerationSeconds: &[]int64{100}[0]}, - }, - }, - Status: v1.PodStatus{ - Conditions: []v1.PodCondition{ - { - Type: v1.PodReady, - Status: v1.ConditionTrue, - }, - }, - }, - }, - }, - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), - expectDelete: true, - additionalSleep: 1500 * time.Millisecond, - }, - // From OpenYurt Authors: - // When node runs in autonomy mode, and get NoExecute taint, do not evict pods on it. - { - description: "Added taint, and node runs in autonomy mode", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - oldNode: &v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - node.AnnotationKeyNodeAutonomy: "true", - }, - }, - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, - newNode: addTaintsToNode(&v1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node1", - Annotations: map[string]string{ - node.AnnotationKeyNodeAutonomy: "true", - }, - }, - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), - v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), - }, - Conditions: []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), - }, - }, - }, - }, "testTaint1", "taint1", []int{1}), - expectDelete: false, - }, - } - - for _, item := range testCases { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), (&nodeHolder{node: item.newNode}).getNode, getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.NodeUpdated(item.oldNode, item.newNode) - // wait a bit - time.Sleep(timeForControllerToProgress) - if item.additionalSleep > 0 { - time.Sleep(item.additionalSleep) - } - - podDeleted := false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if podDeleted != item.expectDelete { 
- t.Errorf("%v: Unexpected test result. Expected delete %v, got %v", item.description, item.expectDelete, podDeleted) - } - close(stopCh) - } -} - -func TestUpdateNodeWithMultipleTaints(t *testing.T) { - taint1 := createNoExecuteTaint(1) - taint2 := createNoExecuteTaint(2) - - minute := int64(60) - pod := testutil.NewPod("pod1", "node1") - pod.Spec.Tolerations = []v1.Toleration{ - {Key: taint1.Key, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute}, - {Key: taint2.Key, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute, TolerationSeconds: &minute}, - } - podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - - untaintedNode := testutil.NewNode("node1") - - doubleTaintedNode := testutil.NewNode("node1") - doubleTaintedNode.Spec.Taints = []v1.Taint{taint1, taint2} - - singleTaintedNode := testutil.NewNode("node1") - singleTaintedNode.Spec.Taints = []v1.Taint{taint1} - - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset(pod) - holder := &nodeHolder{node: untaintedNode} - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), (holder).getNode, getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - - // no taint - holder.setNode(untaintedNode) - controller.handleNodeUpdate(nodeUpdateItem{"node1"}) - // verify pod is not queued for deletion - if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { - t.Fatalf("pod queued for deletion with no taints") - } - - // no taint -> infinitely tolerated taint - holder.setNode(singleTaintedNode) - controller.handleNodeUpdate(nodeUpdateItem{"node1"}) - // verify pod is not queued for deletion - if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { - t.Fatalf("pod queued for deletion with permanently tolerated taint") - } - - // infinitely tolerated taint -> temporarily tolerated taint - holder.setNode(doubleTaintedNode) - controller.handleNodeUpdate(nodeUpdateItem{"node1"}) - // verify pod is queued for deletion - if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) == nil { - t.Fatalf("pod not queued for deletion after addition of temporarily tolerated taint") - } - - // OpenYurt Authors: - // In processPodOnNode of taint_manager.go, we will not cancel deletion - // if it's already scheduled. 
So, do not run this case: - // temporarily tolerated taint -> infinitely tolerated taint - // holder.setNode(singleTaintedNode) - // controller.handleNodeUpdate(nodeUpdateItem{"node1"}) - // // verify pod is not queued for deletion - // if controller.taintEvictionQueue.GetWorkerUnsafe(podNamespacedName.String()) != nil { - // t.Fatalf("pod queued for deletion after removal of temporarily tolerated taint") - // } - - // verify pod is not deleted - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - t.Error("Unexpected deletion") - } - } - close(stopCh) -} - -func TestUpdateNodeWithMultiplePods(t *testing.T) { - testCases := []struct { - description string - pods []v1.Pod - oldNode *v1.Node - newNode *v1.Node - expectedDeleteTimes durationSlice - }{ - { - description: "Pods with different toleration times are evicted appropriately", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - *addToleration(testutil.NewPod("pod2", "node1"), 1, 1), - *addToleration(testutil.NewPod("pod3", "node1"), 1, -1), - }, - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectedDeleteTimes: durationSlice{ - {[]string{"pod1"}, 0}, - {[]string{"pod2"}, time.Second}, - }, - }, - { - description: "Evict all pods not matching all taints instantly", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - *addToleration(testutil.NewPod("pod2", "node1"), 1, 1), - *addToleration(testutil.NewPod("pod3", "node1"), 1, -1), - }, - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1, 2}), - expectedDeleteTimes: durationSlice{ - {[]string{"pod1", "pod2", "pod3"}, 0}, - }, - }, - } - - for _, item := range testCases { - t.Logf("Starting testcase %q", item.description) - - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) - sort.Sort(item.expectedDeleteTimes) - controller := NewNoExecuteTaintManager(fakeClientset, getPodFromClientset(fakeClientset), (&nodeHolder{node: item.newNode}).getNode, getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - controller.NodeUpdated(item.oldNode, item.newNode) - - startedAt := time.Now() - for i := range item.expectedDeleteTimes { - if i == 0 || item.expectedDeleteTimes[i-1].timestamp != item.expectedDeleteTimes[i].timestamp { - // compute a grace duration to give controller time to process updates. Choose big - // enough intervals in the test cases above to avoid flakes. - var increment time.Duration - if i == len(item.expectedDeleteTimes)-1 || item.expectedDeleteTimes[i+1].timestamp == item.expectedDeleteTimes[i].timestamp { - increment = 500 * time.Millisecond - } else { - increment = ((item.expectedDeleteTimes[i+1].timestamp - item.expectedDeleteTimes[i].timestamp) / time.Duration(2)) - } - - sleepTime := item.expectedDeleteTimes[i].timestamp - time.Since(startedAt) + increment - if sleepTime < 0 { - sleepTime = 0 - } - t.Logf("Sleeping for %v", sleepTime) - time.Sleep(sleepTime) - } - - for delay, podName := range item.expectedDeleteTimes[i].names { - deleted := false - for _, action := range fakeClientset.Actions() { - deleteAction, ok := action.(clienttesting.DeleteActionImpl) - if !ok { - t.Logf("Found not-delete action with verb %v. 
Ignoring.", action.GetVerb()) - continue - } - if deleteAction.GetResource().Resource != "pods" { - continue - } - if podName == deleteAction.GetName() { - deleted = true - } - } - if !deleted { - t.Errorf("Failed to deleted pod %v after %v", podName, delay) - } - } - for _, action := range fakeClientset.Actions() { - deleteAction, ok := action.(clienttesting.DeleteActionImpl) - if !ok { - t.Logf("Found not-delete action with verb %v. Ignoring.", action.GetVerb()) - continue - } - if deleteAction.GetResource().Resource != "pods" { - continue - } - deletedPodName := deleteAction.GetName() - expected := false - for _, podName := range item.expectedDeleteTimes[i].names { - if podName == deletedPodName { - expected = true - } - } - if !expected { - t.Errorf("Pod %v was deleted even though it shouldn't have", deletedPodName) - } - } - fakeClientset.ClearActions() - } - - close(stopCh) - } -} - -func TestGetMinTolerationTime(t *testing.T) { - one := int64(1) - two := int64(2) - oneSec := 1 * time.Second - - tests := []struct { - tolerations []v1.Toleration - expected time.Duration - }{ - { - tolerations: []v1.Toleration{}, - expected: 0, - }, - { - tolerations: []v1.Toleration{ - { - TolerationSeconds: nil, - }, - }, - expected: -1, - }, - { - tolerations: []v1.Toleration{ - { - TolerationSeconds: &one, - }, - { - TolerationSeconds: &two, - }, - }, - expected: oneSec, - }, - - { - tolerations: []v1.Toleration{ - { - TolerationSeconds: &one, - }, - { - TolerationSeconds: nil, - }, - }, - expected: oneSec, - }, - { - tolerations: []v1.Toleration{ - { - TolerationSeconds: nil, - }, - { - TolerationSeconds: &one, - }, - }, - expected: oneSec, - }, - } - - for _, test := range tests { - got := getMinTolerationTime(test.tolerations) - if got != test.expected { - t.Errorf("Incorrect min toleration time: got %v, expected %v", got, test.expected) - } - } -} - -// TestEventualConsistency verifies if getPodsAssignedToNode returns incomplete data -// (e.g. due to watch latency), it will reconcile the remaining pods eventually. -// This scenario is partially covered by TestUpdatePods, but given this is an important -// property of TaintManager, it's better to have explicit test for this. 
-func TestEventualConsistency(t *testing.T) { - testCases := []struct { - description string - pods []v1.Pod - prevPod *v1.Pod - newPod *v1.Pod - oldNode *v1.Node - newNode *v1.Node - expectDelete bool - }{ - { - description: "existing pod2 scheduled onto tainted Node", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - prevPod: testutil.NewPod("pod2", ""), - newPod: testutil.NewPod("pod2", "node1"), - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: true, - }, - { - description: "existing pod2 with taint toleration scheduled onto tainted Node", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - prevPod: addToleration(testutil.NewPod("pod2", ""), 1, 100), - newPod: addToleration(testutil.NewPod("pod2", "node1"), 1, 100), - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: false, - }, - { - description: "new pod2 created on tainted Node", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - prevPod: nil, - newPod: testutil.NewPod("pod2", "node1"), - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: true, - }, - { - description: "new pod2 with tait toleration created on tainted Node", - pods: []v1.Pod{ - *testutil.NewPod("pod1", "node1"), - }, - prevPod: nil, - newPod: addToleration(testutil.NewPod("pod2", "node1"), 1, 100), - oldNode: testutil.NewNode("node1"), - newNode: addTaintsToNode(testutil.NewNode("node1"), "testTaint1", "taint1", []int{1}), - expectDelete: false, - }, - } - - for _, item := range testCases { - stopCh := make(chan struct{}) - fakeClientset := fake.NewSimpleClientset(&v1.PodList{Items: item.pods}) - holder := &podHolder{} - controller := NewNoExecuteTaintManager(fakeClientset, holder.getPod, (&nodeHolder{node: item.newNode}).getNode, getPodsAssignedToNode(fakeClientset)) - controller.recorder = testutil.NewFakeRecorder() - go controller.Run(stopCh) - - if item.prevPod != nil { - holder.setPod(item.prevPod) - controller.PodUpdated(nil, item.prevPod) - } - - // First we simulate NodeUpdate that should delete 'pod1'. It doesn't know about 'pod2' yet. - controller.NodeUpdated(item.oldNode, item.newNode) - // TODO(mborsz): Remove this sleep and other sleeps in this file. - time.Sleep(timeForControllerToProgress) - - podDeleted := false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if !podDeleted { - t.Errorf("%v: Unexpected test result. Expected delete, got: %v", item.description, podDeleted) - } - fakeClientset.ClearActions() - - // And now the delayed update of 'pod2' comes to the TaintManager. We should delete it as well. - holder.setPod(item.newPod) - controller.PodUpdated(item.prevPod, item.newPod) - // wait a bit - time.Sleep(timeForControllerToProgress) - - podDeleted = false - for _, action := range fakeClientset.Actions() { - if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" { - podDeleted = true - } - } - if podDeleted != item.expectDelete { - t.Errorf("%v: Unexpected test result. 
Expected delete %v, got %v", item.description, item.expectDelete, podDeleted) - } - close(stopCh) - } -} diff --git a/pkg/controller/nodelifecycle/scheduler/timed_workers.go b/pkg/controller/nodelifecycle/scheduler/timed_workers.go deleted file mode 100644 index ec0edbcda91..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/timed_workers.go +++ /dev/null @@ -1,144 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduler - -import ( - "sync" - "time" - - "k8s.io/apimachinery/pkg/types" - "k8s.io/klog/v2" -) - -// WorkArgs keeps arguments that will be passed to the function executed by the worker. -type WorkArgs struct { - NamespacedName types.NamespacedName -} - -// KeyFromWorkArgs creates a key for the given `WorkArgs` -func (w *WorkArgs) KeyFromWorkArgs() string { - return w.NamespacedName.String() -} - -// NewWorkArgs is a helper function to create new `WorkArgs` -func NewWorkArgs(name, namespace string) *WorkArgs { - return &WorkArgs{types.NamespacedName{Namespace: namespace, Name: name}} -} - -// TimedWorker is a responsible for executing a function no earlier than at FireAt time. -type TimedWorker struct { - WorkItem *WorkArgs - CreatedAt time.Time - FireAt time.Time - Timer *time.Timer -} - -// CreateWorker creates a TimedWorker that will execute `f` not earlier than `fireAt`. -func CreateWorker(args *WorkArgs, createdAt time.Time, fireAt time.Time, f func(args *WorkArgs) error) *TimedWorker { - delay := fireAt.Sub(createdAt) - if delay <= 0 { - go f(args) - return nil - } - timer := time.AfterFunc(delay, func() { f(args) }) - return &TimedWorker{ - WorkItem: args, - CreatedAt: createdAt, - FireAt: fireAt, - Timer: timer, - } -} - -// Cancel cancels the execution of function by the `TimedWorker` -func (w *TimedWorker) Cancel() { - if w != nil { - w.Timer.Stop() - } -} - -// TimedWorkerQueue keeps a set of TimedWorkers that are still wait for execution. -type TimedWorkerQueue struct { - sync.Mutex - // map of workers keyed by string returned by 'KeyFromWorkArgs' from the given worker. - workers map[string]*TimedWorker - workFunc func(args *WorkArgs) error -} - -// CreateWorkerQueue creates a new TimedWorkerQueue for workers that will execute -// given function `f`. -func CreateWorkerQueue(f func(args *WorkArgs) error) *TimedWorkerQueue { - return &TimedWorkerQueue{ - workers: make(map[string]*TimedWorker), - workFunc: f, - } -} - -func (q *TimedWorkerQueue) getWrappedWorkerFunc(key string) func(args *WorkArgs) error { - return func(args *WorkArgs) error { - err := q.workFunc(args) - q.Lock() - defer q.Unlock() - if err == nil { - // To avoid duplicated calls we keep the key in the queue, to prevent - // subsequent additions. - q.workers[key] = nil - } else { - delete(q.workers, key) - } - return err - } -} - -// AddWork adds a work to the WorkerQueue which will be executed not earlier than `fireAt`. 
-func (q *TimedWorkerQueue) AddWork(args *WorkArgs, createdAt time.Time, fireAt time.Time) { - key := args.KeyFromWorkArgs() - klog.V(4).Infof("Adding TimedWorkerQueue item %v at %v to be fired at %v", key, createdAt, fireAt) - - q.Lock() - defer q.Unlock() - if _, exists := q.workers[key]; exists { - klog.Warningf("Trying to add already existing work for %+v. Skipping.", args) - return - } - worker := CreateWorker(args, createdAt, fireAt, q.getWrappedWorkerFunc(key)) - q.workers[key] = worker -} - -// CancelWork removes scheduled function execution from the queue. Returns true if work was cancelled. -func (q *TimedWorkerQueue) CancelWork(key string) bool { - q.Lock() - defer q.Unlock() - worker, found := q.workers[key] - result := false - if found { - klog.V(4).Infof("Cancelling TimedWorkerQueue item %v at %v", key, time.Now()) - if worker != nil { - result = true - worker.Cancel() - } - delete(q.workers, key) - } - return result -} - -// GetWorkerUnsafe returns a TimedWorker corresponding to the given key. -// Unsafe method - workers have attached goroutines which can fire after this function is called. -func (q *TimedWorkerQueue) GetWorkerUnsafe(key string) *TimedWorker { - q.Lock() - defer q.Unlock() - return q.workers[key] -} diff --git a/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go b/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go deleted file mode 100644 index 0de8a9be5e6..00000000000 --- a/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go +++ /dev/null @@ -1,141 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package scheduler - -import ( - "sync" - "sync/atomic" - "testing" - "time" -) - -func TestExecute(t *testing.T) { - testVal := int32(0) - wg := sync.WaitGroup{} - wg.Add(5) - queue := CreateWorkerQueue(func(args *WorkArgs) error { - atomic.AddInt32(&testVal, 1) - wg.Done() - return nil - }) - now := time.Now() - queue.AddWork(NewWorkArgs("1", "1"), now, now) - queue.AddWork(NewWorkArgs("2", "2"), now, now) - queue.AddWork(NewWorkArgs("3", "3"), now, now) - queue.AddWork(NewWorkArgs("4", "4"), now, now) - queue.AddWork(NewWorkArgs("5", "5"), now, now) - // Adding the same thing second time should be no-op - queue.AddWork(NewWorkArgs("1", "1"), now, now) - queue.AddWork(NewWorkArgs("2", "2"), now, now) - queue.AddWork(NewWorkArgs("3", "3"), now, now) - queue.AddWork(NewWorkArgs("4", "4"), now, now) - queue.AddWork(NewWorkArgs("5", "5"), now, now) - wg.Wait() - lastVal := atomic.LoadInt32(&testVal) - if lastVal != 5 { - t.Errorf("Expected testVal = 5, got %v", lastVal) - } -} - -func TestExecuteDelayed(t *testing.T) { - testVal := int32(0) - wg := sync.WaitGroup{} - wg.Add(5) - queue := CreateWorkerQueue(func(args *WorkArgs) error { - atomic.AddInt32(&testVal, 1) - wg.Done() - return nil - }) - now := time.Now() - then := now.Add(10 * time.Second) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - wg.Wait() - lastVal := atomic.LoadInt32(&testVal) - if lastVal != 5 { - t.Errorf("Expected testVal = 5, got %v", lastVal) - } -} - -func TestCancel(t *testing.T) { - testVal := int32(0) - wg := sync.WaitGroup{} - wg.Add(3) - queue := CreateWorkerQueue(func(args *WorkArgs) error { - atomic.AddInt32(&testVal, 1) - wg.Done() - return nil - }) - now := time.Now() - then := now.Add(10 * time.Second) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs()) - queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs()) - wg.Wait() - lastVal := atomic.LoadInt32(&testVal) - if lastVal != 3 { - t.Errorf("Expected testVal = 3, got %v", lastVal) - } -} - -func TestCancelAndReadd(t *testing.T) { - testVal := int32(0) - wg := sync.WaitGroup{} - wg.Add(4) - queue := CreateWorkerQueue(func(args *WorkArgs) error { - atomic.AddInt32(&testVal, 1) - wg.Done() - return nil - }) - now := time.Now() - then := now.Add(10 * time.Second) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - queue.AddWork(NewWorkArgs("1", "1"), now, then) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - 
queue.AddWork(NewWorkArgs("3", "3"), now, then) - queue.AddWork(NewWorkArgs("4", "4"), now, then) - queue.AddWork(NewWorkArgs("5", "5"), now, then) - queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs()) - queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs()) - queue.AddWork(NewWorkArgs("2", "2"), now, then) - wg.Wait() - lastVal := atomic.LoadInt32(&testVal) - if lastVal != 4 { - t.Errorf("Expected testVal = 4, got %v", lastVal) - } -} diff --git a/pkg/controller/poolcoordinator/constant/constant.go b/pkg/controller/poolcoordinator/constant/constant.go new file mode 100644 index 00000000000..65d1c7ae88f --- /dev/null +++ b/pkg/controller/poolcoordinator/constant/constant.go @@ -0,0 +1,31 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package constant + +const ( + // nodeutil.AnnotationKeyNodeAutonomy + AnnotationKeyNodeAutonomy = "node.beta.openyurt.io/autonomy" + LabelKeyNodePool = "apps.openyurt.io/nodepool" + + DelegateHeartBeat = "openyurt.io/delegate-heartbeat" + + // when node cannot reach api-server directly but can be delegated lease, we should taint the node as unschedulable + NodeNotSchedulableTaint = "node.openyurt.io/unschedulable" + // number of lease intervals passed before we taint/detaint node as unschedulable + LeaseDelegationThreshold = 4 +) diff --git a/pkg/controller/poolcoordinator/poolcoordinator_controller.go b/pkg/controller/poolcoordinator/poolcoordinator_controller.go new file mode 100644 index 00000000000..13bb584baa7 --- /dev/null +++ b/pkg/controller/poolcoordinator/poolcoordinator_controller.go @@ -0,0 +1,233 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package poolcoordinator + +import ( + "context" + "fmt" + "time" + + coordv1 "k8s.io/api/coordination/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" + informercoordv1 "k8s.io/client-go/informers/coordination/v1" + v1 "k8s.io/client-go/informers/core/v1" + client "k8s.io/client-go/kubernetes" + leaselisterv1 "k8s.io/client-go/listers/coordination/v1" + listerv1 "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/utils" +) + +const ( + numWorkers = 5 +) + +type Controller struct { + client client.Interface + nodeInformer v1.NodeInformer + nodeSynced cache.InformerSynced + nodeLister listerv1.NodeLister + leaseInformer informercoordv1.LeaseInformer + leaseSynced cache.InformerSynced + leaseLister leaselisterv1.LeaseNamespaceLister + nodeUpdateQueue workqueue.Interface + + ldc *utils.LeaseDelegatedCounter +} + +func (c *Controller) onLeaseCreate(n interface{}) { + nl := n.(*coordv1.Lease) + if nl.Namespace != corev1.NamespaceNodeLease { + return + } + //klog.Infof("new lease: %v\n", nl) + + key, err := cache.MetaNamespaceKeyFunc(n) + if err == nil { + c.nodeUpdateQueue.Add(key) + } +} + +func (c *Controller) onLeaseUpdate(o interface{}, n interface{}) { + //ol := o.(*coordv1.Lease) + nl := n.(*coordv1.Lease) + if nl.Namespace != corev1.NamespaceNodeLease { + return + } + + //klog.Infof("updated lease for: %v\n", nl.Name) + + key, err := cache.MetaNamespaceKeyFunc(n) + if err == nil { + c.nodeUpdateQueue.Add(key) + } +} + +func NewController(kc client.Interface, informerFactory informers.SharedInformerFactory) *Controller { + ctl := &Controller{ + client: kc, + nodeUpdateQueue: workqueue.NewNamed("poolcoordinator_node"), + ldc: utils.NewLeaseDelegatedCounter(), + } + + if informerFactory != nil { + ctl.nodeInformer = informerFactory.Core().V1().Nodes() + ctl.nodeSynced = ctl.nodeInformer.Informer().HasSynced + ctl.nodeLister = ctl.nodeInformer.Lister() + ctl.leaseInformer = informerFactory.Coordination().V1().Leases() + ctl.leaseSynced = ctl.leaseInformer.Informer().HasSynced + ctl.leaseLister = ctl.leaseInformer.Lister().Leases(corev1.NamespaceNodeLease) + + ctl.leaseInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: ctl.onLeaseCreate, + UpdateFunc: ctl.onLeaseUpdate, + DeleteFunc: nil, + }) + } + + return ctl +} + +func (c *Controller) taintNodeNotSchedulable(name string) { + node, err := c.nodeLister.Get(name) + if err != nil { + klog.Error(err) + return + } + c.doTaintNodeNotSchedulable(node) +} + +func (c *Controller) doTaintNodeNotSchedulable(node *corev1.Node) *corev1.Node { + taints := node.Spec.Taints + if utils.TaintKeyExists(taints, constant.NodeNotSchedulableTaint) { + klog.Infof("taint %s: key %s already exists, nothing to do\n", node.Name, constant.NodeNotSchedulableTaint) + return node + } + nn := node.DeepCopy() + t := corev1.Taint{ + Key: constant.NodeNotSchedulableTaint, + Value: "true", + Effect: corev1.TaintEffectNoSchedule, + } + nn.Spec.Taints = append(nn.Spec.Taints, t) + var err error + if c.client != nil { + nn, err = c.client.CoreV1().Nodes().Update(context.TODO(), nn, metav1.UpdateOptions{}) + if err != nil { + klog.Error(err) + } + } + return nn +} + +func (c *Controller) 
deTaintNodeNotSchedulable(name string) { + node, err := c.nodeLister.Get(name) + if err != nil { + klog.Error(err) + return + } + c.doDeTaintNodeNotSchedulable(node) +} + +func (c *Controller) doDeTaintNodeNotSchedulable(node *corev1.Node) *corev1.Node { + taints := node.Spec.Taints + taints, deleted := utils.DeleteTaintsByKey(taints, constant.NodeNotSchedulableTaint) + if !deleted { + klog.Infof("detaint %s: no key %s exists, nothing to do\n", node.Name, constant.NodeNotSchedulableTaint) + return node + } + nn := node.DeepCopy() + nn.Spec.Taints = taints + var err error + if c.client != nil { + nn, err = c.client.CoreV1().Nodes().Update(context.TODO(), nn, metav1.UpdateOptions{}) + if err != nil { + klog.Error(err) + } + } + return nn +} + +func (c *Controller) syncHandler(key string) error { + _, name, err := cache.SplitMetaNamespaceKey(key) + if err != nil { + return fmt.Errorf("invalid resource key: %s", key) + } + + nl, err := c.leaseLister.Get(name) + if err != nil { + klog.Errorf("couldn't get lease for %s, maybe it has been deleted\n", name) + c.ldc.Del(name) + return nil + } + + nval, nok := nl.Annotations[constant.DelegateHeartBeat] + + if nok && nval == "true" { + c.ldc.Inc(nl.Name) + if c.ldc.Counter(nl.Name) >= constant.LeaseDelegationThreshold { + c.taintNodeNotSchedulable(nl.Name) + } + } else { + if c.ldc.Counter(nl.Name) >= constant.LeaseDelegationThreshold { + c.deTaintNodeNotSchedulable(nl.Name) + } + c.ldc.Reset(nl.Name) + } + + return nil +} + +func (c *Controller) nodeWorker() { + for { + key, shutdown := c.nodeUpdateQueue.Get() + if shutdown { + klog.Info("node work queue shutdown") + return + } + + if err := c.syncHandler(key.(string)); err != nil { + runtime.HandleError(err) + } + + c.nodeUpdateQueue.Done(key) + } +} + +func (c *Controller) Run(stopCH <-chan struct{}) { + if !cache.WaitForCacheSync(stopCH, c.nodeSynced, c.leaseSynced) { + klog.Error("sync poolcoordinator controller timeout") + } + + defer c.nodeUpdateQueue.ShutDown() + + klog.Info("start node taint workers") + for i := 0; i < numWorkers; i++ { + go wait.Until(c.nodeWorker, time.Second, stopCH) + } + + <-stopCH +} diff --git a/pkg/controller/poolcoordinator/poolcoordinator_controller_test.go b/pkg/controller/poolcoordinator/poolcoordinator_controller_test.go new file mode 100644 index 00000000000..34159c2dc48 --- /dev/null +++ b/pkg/controller/poolcoordinator/poolcoordinator_controller_test.go @@ -0,0 +1,40 @@ +/* +Copyright 2023 The OpenYurt Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package poolcoordinator + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" +) + +func TestTaintNode(t *testing.T) { + c := NewController(nil, nil) + node := &corev1.Node{} + node = c.doDeTaintNodeNotSchedulable(node) + if len(node.Spec.Taints) != 0 { + t.Fail() + } + node = c.doTaintNodeNotSchedulable(node) + if len(node.Spec.Taints) == 0 { + t.Fail() + } + node = c.doDeTaintNodeNotSchedulable(node) + if len(node.Spec.Taints) != 0 { + t.Fail() + } +} diff --git a/pkg/controller/poolcoordinator/utils/file.go b/pkg/controller/poolcoordinator/utils/file.go new file mode 100644 index 00000000000..df456e7d907 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/file.go @@ -0,0 +1,56 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "os" +) + +func GetEnv(key, fallback string) string { + if value, ok := os.LookupEnv(key); ok { + return value + } + return fallback +} + +func FileExists(filename string) bool { + info, err := os.Stat(filename) + if os.IsNotExist(err) { + return false + } + return !info.IsDir() +} + +func DirExists(filename string) bool { + info, err := os.Stat(filename) + if os.IsNotExist(err) { + return false + } + return info.IsDir() +} + +func EnsureDir(dir string) error { + if DirExists(dir) { + return nil + } + return os.MkdirAll(dir, 0750) +} + +func WriteFile(fn string, data []byte) error { + return os.WriteFile(fn, data, 0660) +} diff --git a/pkg/controller/poolcoordinator/utils/file_test.go b/pkg/controller/poolcoordinator/utils/file_test.go new file mode 100644 index 00000000000..0b6ce5af56b --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/file_test.go @@ -0,0 +1,48 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utils + +import ( + "testing" +) + +func TestGetEnv(t *testing.T) { + r := GetEnv("", "a") + if r != "a" { + t.Fail() + } +} + +func TestFileExists(t *testing.T) { + WriteFile("/tmp/abcd", []byte("abcd")) + if FileExists("/tmp/abcd") != true { + t.Fail() + } +} + +func TestDirExists(t *testing.T) { + if DirExists("/tmp") != true { + t.Fail() + } +} + +func TestEnsureDir(t *testing.T) { + if EnsureDir("/tmp") != nil { + t.Fail() + } +} diff --git a/pkg/controller/poolcoordinator/utils/lease.go b/pkg/controller/poolcoordinator/utils/lease.go new file mode 100644 index 00000000000..d4d3cd54907 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/lease.go @@ -0,0 +1,75 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "sync" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" +) + +type LeaseDelegatedCounter struct { + v map[string]int + lock sync.Mutex +} + +func NewLeaseDelegatedCounter() *LeaseDelegatedCounter { + return &LeaseDelegatedCounter{ + v: make(map[string]int), + } +} + +func (dc *LeaseDelegatedCounter) Inc(name string) { + dc.lock.Lock() + defer dc.lock.Unlock() + + if dc.v[name] >= constant.LeaseDelegationThreshold { + return + } + dc.v[name] += 1 +} + +func (dc *LeaseDelegatedCounter) Dec(name string) { + dc.lock.Lock() + defer dc.lock.Unlock() + + if dc.v[name] > 0 { + dc.v[name] -= 1 + } +} + +func (dc *LeaseDelegatedCounter) Reset(name string) { + dc.lock.Lock() + defer dc.lock.Unlock() + + dc.v[name] = 0 +} + +func (dc *LeaseDelegatedCounter) Del(name string) { + dc.lock.Lock() + defer dc.lock.Unlock() + + delete(dc.v, name) +} + +func (dc *LeaseDelegatedCounter) Counter(name string) int { + dc.lock.Lock() + defer dc.lock.Unlock() + + return dc.v[name] +} diff --git a/pkg/controller/poolcoordinator/utils/lease_test.go b/pkg/controller/poolcoordinator/utils/lease_test.go new file mode 100644 index 00000000000..905c219e924 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/lease_test.go @@ -0,0 +1,36 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utils + +import ( + "testing" +) + +func TestLeaseDelegatedCounter(t *testing.T) { + ldc := NewLeaseDelegatedCounter() + ldc.Inc("node1") + expected := 1 + if ldc.Counter("node1") != expected { + t.Errorf("expect %v, but %v returned", expected, ldc.Counter("node1")) + } + ldc.Dec("node1") + expected = 0 + if ldc.Counter("node1") != expected { + t.Errorf("expect %v, but %v returned", expected, ldc.Counter("node1")) + } +} diff --git a/pkg/controller/poolcoordinator/utils/nodepool.go b/pkg/controller/poolcoordinator/utils/nodepool.go new file mode 100644 index 00000000000..91c0f3e10f8 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/nodepool.go @@ -0,0 +1,153 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + leaselisterv1 "k8s.io/client-go/listers/coordination/v1" + "k8s.io/klog/v2" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" +) + +type NodepoolMap struct { + nodepools map[string]sets.String + lock sync.Mutex +} + +func NewNodepoolMap() *NodepoolMap { + return &NodepoolMap{ + nodepools: make(map[string]sets.String), + } +} + +func (m *NodepoolMap) Add(pool, node string) { + m.lock.Lock() + defer m.lock.Unlock() + + if m.nodepools[pool] == nil { + m.nodepools[pool] = sets.String{} + } + m.nodepools[pool].Insert(node) +} + +func (m *NodepoolMap) Del(pool, node string) { + m.lock.Lock() + defer m.lock.Unlock() + + if m.nodepools[pool] == nil { + return + } + + m.nodepools[pool].Delete(node) + if m.nodepools[pool].Len() == 0 { + delete(m.nodepools, pool) + } +} + +// delete a node from a pool, if any +func (m *NodepoolMap) DelNode(node string) { + m.lock.Lock() + defer m.lock.Unlock() + + for p := range m.nodepools { + m.nodepools[p].Delete(node) + } +} + +func (m *NodepoolMap) GetPool(node string) (string, bool) { + m.lock.Lock() + defer m.lock.Unlock() + + for p := range m.nodepools { + if m.nodepools[p].Has(node) { + return p, true + } + } + return "", false +} + +func (m *NodepoolMap) Count(pool string) int { + if m.nodepools[pool] != nil { + return m.nodepools[pool].Len() + } + return 0 +} + +func (m *NodepoolMap) Nodes(pool string) []string { + if m.nodepools[pool] != nil { + return m.nodepools[pool].UnsortedList() + } + return []string{} +} + +func (m *NodepoolMap) Sync(nodes []*corev1.Node) { + for _, n := range nodes { + pool, ok := NodeNodepool(n) + if ok { + m.Add(pool, n.Name) + } + } +} + +func NodeIsInAutonomy(node *corev1.Node) bool { + if node != nil && node.Annotations != nil && node.Annotations[constant.AnnotationKeyNodeAutonomy] == "true" { + return true + } + return false +} + +func NodeIsAlive(leaseLister leaselisterv1.LeaseNamespaceLister, nodeName string) bool { + if leaseLister == nil { + return false + } + + lease, err := leaseLister.Get(nodeName) + if err != nil { + klog.Error(err) + return false + } + diff := 
time.Now().Sub(lease.Spec.RenewTime.Time) + if diff.Seconds() > 40 { + return false + } + return true +} + +func CountAliveNode(leaseLister leaselisterv1.LeaseNamespaceLister, nodes []string) int { + cnt := 0 + for _, n := range nodes { + if NodeIsAlive(leaseLister, n) { + cnt++ + } + } + return cnt +} + +func NodeNodepool(node *corev1.Node) (string, bool) { + if node.Labels != nil { + val, ok := node.Labels[constant.LabelKeyNodePool] + return val, ok + } + + return "", false +} diff --git a/pkg/controller/poolcoordinator/utils/nodepool_test.go b/pkg/controller/poolcoordinator/utils/nodepool_test.go new file mode 100644 index 00000000000..87d3cd9c120 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/nodepool_test.go @@ -0,0 +1,117 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "reflect" + "sort" + "testing" + + corev1 "k8s.io/api/core/v1" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" +) + +func TestNodeMap(t *testing.T) { + nm := NewNodepoolMap() + nm.Add("pool1", "node1") + nm.Add("pool1", "node2") + nm.Add("pool2", "node3") + nm.Add("pool2", "node4") + nm.Add("pool2", "node5") + + if nm.Count("pool1") != 2 { + t.Errorf("expect %v, but %v returned", 2, nm.Count("pool1")) + } + if nm.Count("pool2") != 3 { + t.Errorf("expect %v, but %v returned", 3, nm.Count("pool2")) + } + nm.Del("pool2", "node4") + if nm.Count("pool2") != 2 { + t.Errorf("expect %v, but %v returned", 2, nm.Count("pool2")) + } + pool, ok := nm.GetPool("node1") + if !ok { + t.Errorf("node1's pool should be pool1") + } + if pool != "pool1" { + t.Errorf("node1's pool should be pool1") + } + + nodes := nm.Nodes("pool2") + sort.Sort(sort.StringSlice(nodes)) + expected := []string{"node3", "node5"} + if !reflect.DeepEqual(nodes, expected) { + t.Errorf("expect %v, but %v returned", expected, nodes) + } + nm.DelNode("node3") + nodes = nm.Nodes("pool2") + sort.Sort(sort.StringSlice(nodes)) + expected = []string{"node5"} + if !reflect.DeepEqual(nodes, expected) { + t.Errorf("expect %v, but %v returned", expected, nodes) + } + nm.DelNode("node5") + nodes = nm.Nodes("pool2") + sort.Sort(sort.StringSlice(nodes)) + expected = []string{} + if !reflect.DeepEqual(nodes, expected) { + t.Errorf("expect %v, but %v returned", expected, nodes) + } + + nm.Del("pool1", "node1") + nm.Del("pool1", "node2") + if nm.Count("pool1") != 0 { + t.Errorf("expect %v, but %v returned", 0, nm.Count("pool1")) + } + nodes = nm.Nodes("pool1") + expected = []string{} + if !reflect.DeepEqual(nodes, expected) { + t.Errorf("expect %v, but %v returned", expected, nodes) + } + + nm.DelNode("node5") + nm.DelNode("node1") + nodes = nm.Nodes("pool2") + sort.Sort(sort.StringSlice(nodes)) + expected = []string{} + if !reflect.DeepEqual(nodes, expected) { + t.Errorf("expect %v, but %v returned", expected, nodes) + } + +} + +func TestNodeNodepool(t *testing.T) { + node := &corev1.Node{} + node.Labels = map[string]string{} + 
node.Labels[constant.LabelKeyNodePool] = "ut" + pool, ok := NodeNodepool(node) + if ok == false || pool != "ut" { + t.Fail() + } +} + +func TestNodeIsInAutonomy(t *testing.T) { + node := &corev1.Node{} + node.Annotations = map[string]string{} + node.Annotations[constant.AnnotationKeyNodeAutonomy] = "true" + if NodeIsInAutonomy(node) != true { + t.Fail() + } + +} diff --git a/pkg/controller/poolcoordinator/utils/taints.go b/pkg/controller/poolcoordinator/utils/taints.go new file mode 100644 index 00000000000..cd643b42954 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/taints.go @@ -0,0 +1,44 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import v1 "k8s.io/api/core/v1" + +// DeleteTaintsByKey removes all the taints that have the same key to given taintKey +func DeleteTaintsByKey(taints []v1.Taint, taintKey string) ([]v1.Taint, bool) { + newTaints := []v1.Taint{} + deleted := false + for i := range taints { + if taintKey == taints[i].Key { + deleted = true + continue + } + newTaints = append(newTaints, taints[i]) + } + return newTaints, deleted +} + +// TaintKeyExists checks if the given taint key exists in list of taints. Returns true if exists false otherwise. +func TaintKeyExists(taints []v1.Taint, taintKeyToMatch string) bool { + for _, taint := range taints { + if taint.Key == taintKeyToMatch { + return true + } + } + return false +} diff --git a/pkg/controller/poolcoordinator/utils/taints_test.go b/pkg/controller/poolcoordinator/utils/taints_test.go new file mode 100644 index 00000000000..9d27032f6ab --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/taints_test.go @@ -0,0 +1,59 @@ +/* +Copyright 2023 The OpenYurt Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package utils + +import ( + "testing" + + v1 "k8s.io/api/core/v1" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" +) + +func TestDeleteTaintsByKey(t *testing.T) { + taints := []v1.Taint{ + { + Key: constant.NodeNotSchedulableTaint, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + } + _, d := DeleteTaintsByKey(taints, "key") + if d == true { + t.Fail() + } + _, d = DeleteTaintsByKey(taints, constant.NodeNotSchedulableTaint) + if d != true { + t.Fail() + } +} + +func TestTaintKeyExists(t *testing.T) { + taints := []v1.Taint{ + { + Key: constant.NodeNotSchedulableTaint, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + } + if TaintKeyExists(taints, "key") != false { + t.Fail() + } + if TaintKeyExists(taints, constant.NodeNotSchedulableTaint) != true { + t.Fail() + } +} diff --git a/pkg/controller/poolcoordinator/utils/tolerations.go b/pkg/controller/poolcoordinator/utils/tolerations.go new file mode 100644 index 00000000000..08d41164a02 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/tolerations.go @@ -0,0 +1,110 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/klog/v2" +) + +// VerifyAgainstWhitelist checks if the provided tolerations +// satisfy the provided whitelist and returns true, otherwise returns false +func VerifyAgainstWhitelist(tolerations, whitelist []corev1.Toleration) bool { + if len(whitelist) == 0 || len(tolerations) == 0 { + return true + } + +next: + for _, t := range tolerations { + for _, w := range whitelist { + if isSuperset(w, t) { + continue next + } + } + return false + } + + return true +} + +// MergeTolerations merges two sets of tolerations into one. If one toleration is a superset of +// another, only the superset is kept. +func MergeTolerations(first, second []corev1.Toleration) ([]corev1.Toleration, bool) { + all := append(first, second...) + var merged []corev1.Toleration + var changed bool + +next: + for i, t := range all { + for _, t2 := range merged { + if isSuperset(t2, t) { + continue next // t is redundant; ignore it + } + } + if i+1 < len(all) { + for _, t2 := range all[i+1:] { + // If the tolerations are equal, prefer the first. + if !apiequality.Semantic.DeepEqual(&t, &t2) && isSuperset(t2, t) { + continue next // t is redundant; ignore it + } + } + } + merged = append(merged, t) + changed = true + } + + return merged, changed +} + +// isSuperset checks whether ss tolerates a superset of t. +func isSuperset(ss, t corev1.Toleration) bool { + if apiequality.Semantic.DeepEqual(&t, &ss) { + return true + } + + if t.Key != ss.Key && + // An empty key with Exists operator means match all keys & values. + (ss.Key != "" || ss.Operator != corev1.TolerationOpExists) { + return false + } + + // An empty effect means match all effects. 
+ if t.Effect != ss.Effect && ss.Effect != "" { + return false + } + + if ss.Effect == corev1.TaintEffectNoExecute { + if ss.TolerationSeconds != nil { + if t.TolerationSeconds == nil || + *t.TolerationSeconds > *ss.TolerationSeconds { + return false + } + } + } + + switch ss.Operator { + case corev1.TolerationOpEqual, "": // empty operator means Equal + return t.Operator == corev1.TolerationOpEqual && t.Value == ss.Value + case corev1.TolerationOpExists: + return true + default: + klog.Errorf("Unknown toleration operator: %s", ss.Operator) + return false + } +} diff --git a/pkg/controller/poolcoordinator/utils/tolerations_test.go b/pkg/controller/poolcoordinator/utils/tolerations_test.go new file mode 100644 index 00000000000..2796f5f6b20 --- /dev/null +++ b/pkg/controller/poolcoordinator/utils/tolerations_test.go @@ -0,0 +1,92 @@ +/* +Copyright 2022 The OpenYurt Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "reflect" + "testing" + + corev1 "k8s.io/api/core/v1" +) + +func TestVerifyAgainstWhitelist(t *testing.T) { + toadd := []corev1.Toleration{ + {Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil}, + } + current := []corev1.Toleration{ + {Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil}, + {Key: "node.kubernetes.io/not-ready", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil}, + } + if VerifyAgainstWhitelist(toadd, current) != true { + t.Fail() + } +} + +func TestMergeTolerations(t *testing.T) { + toadd := []corev1.Toleration{ + {Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil}, + {Key: "node.kubernetes.io/not-ready", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil}, + } + var ss int64 = 300 + current := []corev1.Toleration{ + {Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: &ss}, + {Key: "node.kubernetes.io/not-ready", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: &ss}, + } + r, m := MergeTolerations(toadd, current) + if m != true { + t.Fail() + } + if reflect.DeepEqual(r, toadd) != true { + t.Fail() + } +} + +func TestIsSuperset(t *testing.T) { + t1 := corev1.Toleration{Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: nil} + var ss int64 = 300 + t2 := corev1.Toleration{Key: "node.kubernetes.io/unreachable", + Operator: "Exists", + Effect: "NoExecute", + TolerationSeconds: &ss} + if isSuperset(t1, t2) != true { + t.Fail() + } +} diff --git a/pkg/webhook/certs.go b/pkg/webhook/certs.go new file mode 100644 index 00000000000..dbccb3060f9 --- /dev/null +++ b/pkg/webhook/certs.go @@ -0,0 +1,166 @@ +/* +Copyright 2022 The OpenYurt Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package webhook + +import ( + "crypto" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "errors" + "fmt" + "math" + "math/big" + "net" + "time" + + "k8s.io/client-go/util/cert" + "k8s.io/client-go/util/keyutil" +) + +const ( + CAKeyName = "ca-key.pem" + CACertName = "ca-cert.pem" + ServerKeyName = "key.pem" + ServerKeyName2 = "tls.key" + ServerCertName = "cert.pem" + ServerCertName2 = "tls.crt" +) + +type Certs struct { + // PEM encoded private key + Key []byte + // PEM encoded serving certificate + Cert []byte + // PEM encoded CA private key + CAKey []byte + // PEM encoded CA certificate + CACert []byte + // Resource version of the certs + ResourceVersion string +} + +const ( + rsaKeySize = 2048 +) + +// GenerateCerts generate a suite of self signed CA and server cert +func GenerateCerts(serviceNamespace, serviceName string) *Certs { + certs := &Certs{} + certs.generate(serviceNamespace, serviceName) + + return certs +} + +func (c *Certs) generate(serviceNamespace, serviceName string) error { + caKey, err := rsa.GenerateKey(rand.Reader, rsaKeySize) + if err != nil { + return fmt.Errorf("failed to create CA private key: %v", err) + } + caCert, err := cert.NewSelfSignedCACert(cert.Config{CommonName: "yurt-webhooks-cert-ca"}, caKey) + if err != nil { + return fmt.Errorf("failed to create CA cert: %v", err) + } + + key, err := rsa.GenerateKey(rand.Reader, rsaKeySize) + if err != nil { + return fmt.Errorf("failed to create private key: %v", err) + } + + commonName := ServiceToCommonName(serviceNamespace, serviceName) + hostIP := net.ParseIP(commonName) + var altIPs []net.IP + if hostIP.To4() != nil { + altIPs = append(altIPs, hostIP.To4()) + } + dnsNames := []string{serviceName, fmt.Sprintf("%s.%s", serviceName, serviceNamespace), commonName} + cert, err := NewSignedCert( + cert.Config{ + CommonName: commonName, + Usages: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + AltNames: cert.AltNames{IPs: altIPs, DNSNames: dnsNames}, + }, + key, caCert, caKey, + ) + if err != nil { + return fmt.Errorf("failed to create cert: %v", err) + } + + c.Key = EncodePrivateKeyPEM(key) + c.Cert = EncodeCertPEM(cert) + c.CAKey = EncodePrivateKeyPEM(caKey) + c.CACert = EncodeCertPEM(caCert) + + return nil +} + +func NewSignedCert(cfg cert.Config, key crypto.Signer, caCert *x509.Certificate, caKey crypto.Signer) (*x509.Certificate, error) { + serial, err := rand.Int(rand.Reader, new(big.Int).SetInt64(math.MaxInt64)) + if err != nil { + return nil, err + } + if len(cfg.CommonName) == 0 { + return nil, errors.New("must specify a CommonName") + } + if len(cfg.Usages) == 0 { + return nil, errors.New("must specify at least one ExtKeyUsage") + } + + certTmpl := x509.Certificate{ + Subject: pkix.Name{ + CommonName: cfg.CommonName, + Organization: cfg.Organization, + }, + DNSNames: cfg.AltNames.DNSNames, + IPAddresses: cfg.AltNames.IPs, + SerialNumber: serial, + NotBefore: caCert.NotBefore, + NotAfter: time.Now().Add(time.Hour * 24 * 365 * 100).UTC(), + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: cfg.Usages, + } + certDERBytes, err := 
x509.CreateCertificate(rand.Reader, &certTmpl, caCert, key.Public(), caKey)
+	if err != nil {
+		return nil, err
+	}
+	return x509.ParseCertificate(certDERBytes)
+}
+
+// EncodePrivateKeyPEM returns PEM-encoded private key data
+func EncodePrivateKeyPEM(key *rsa.PrivateKey) []byte {
+	block := pem.Block{
+		Type:  keyutil.RSAPrivateKeyBlockType,
+		Bytes: x509.MarshalPKCS1PrivateKey(key),
+	}
+	return pem.EncodeToMemory(&block)
+}
+
+// EncodeCertPEM returns PEM-encoded certificate data
+func EncodeCertPEM(ct *x509.Certificate) []byte {
+	block := pem.Block{
+		Type:  cert.CertificateBlockType,
+		Bytes: ct.Raw,
+	}
+	return pem.EncodeToMemory(&block)
+}
+
+// ServiceToCommonName generates the CommonName for the certificate when using a k8s service.
+func ServiceToCommonName(serviceNamespace, serviceName string) string {
+	return fmt.Sprintf("%s.%s.svc", serviceName, serviceNamespace)
+}
diff --git a/pkg/webhook/certs_test.go b/pkg/webhook/certs_test.go
new file mode 100644
index 00000000000..4bf2aa7bc07
--- /dev/null
+++ b/pkg/webhook/certs_test.go
@@ -0,0 +1,29 @@
+/*
+Copyright 2023 The OpenYurt Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhook
+
+import (
+	"testing"
+)
+
+//func TestGenerateCerts(serviceNamespace, serviceName string) *Certs {
+func TestGenerateCerts(t *testing.T) {
+	certs := GenerateCerts("kube-system", "yurt-controller-manager-webhook")
+	if len(certs.CACert) == 0 || len(certs.CAKey) == 0 || len(certs.Cert) == 0 || len(certs.Key) == 0 {
+		t.Fail()
+	}
+}
diff --git a/pkg/webhook/poolcoordinator_webhook.go b/pkg/webhook/poolcoordinator_webhook.go
new file mode 100644
index 00000000000..0f9d5b1338e
--- /dev/null
+++ b/pkg/webhook/poolcoordinator_webhook.go
@@ -0,0 +1,673 @@
+/*
+Copyright 2022 The OpenYurt Authors.
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package webhook + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "strings" + + "github.com/wI2L/jsondiff" + admissionv1 "k8s.io/api/admission/v1" + admissionregistrationv1 "k8s.io/api/admissionregistration/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + types "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/informers" + informercoordv1 "k8s.io/client-go/informers/coordination/v1" + v1 "k8s.io/client-go/informers/core/v1" + client "k8s.io/client-go/kubernetes" + leaselisterv1 "k8s.io/client-go/listers/coordination/v1" + listerv1 "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/utils" +) + +const ( + msgNodeAutonomy string = "node autonomy annotated, eviction aborted" + msgPodAvailableNode string = "pod should exist on the specific node, eviction aborted" + msgPodAvailablePoolAndNodeIsAlive string = "node is actually alive in a pool, eviction aborted" + msgPodAvailablePoolAndNodeIsNotAlive string = "node is not alive in a pool, eviction approved" + msgPodDeleteValidated string = "pod deletion validated" + msgPoolHasTooFewReadyNodes string = "nodepool has too few ready nodes" + + // pod can have two autonomy modes: node scope autonomy, or nodepool scope autonomy + PodAutonomyAnnotation = "apps.openyurt.io/autonomy" + PodAutonomyNode = "node" + PodAutonomyPool = "pool" + + // when ready nodes in a pool is below this value, we don't allow pod transition any more + PoolReadyNodeNumberRatioThresholdDefault = 0.35 + + MaxRetries = 30 +) + +type PoolCoordinatorWebhook struct { + client client.Interface + nodeInformer v1.NodeInformer + nodeSynced cache.InformerSynced + nodeLister listerv1.NodeLister + leaseInformer informercoordv1.LeaseInformer + leaseSynced cache.InformerSynced + leaseLister leaselisterv1.LeaseNamespaceLister + + nodepoolMap *utils.NodepoolMap + + nodePoolUpdateQueue workqueue.RateLimitingInterface + + validatingConfigurationName string + mutatingConfigurationName string + validatingName string + mutatingName string + serviceName string + validatingPath string + mutatingPath string + namespace string +} + +type validation struct { + Valid bool + Reason string +} + +type PodAdmission struct { + request *admissionv1.AdmissionRequest + pod *corev1.Pod + node *corev1.Node + leaseLister leaselisterv1.LeaseNamespaceLister + nodepoolMap *utils.NodepoolMap +} + +func getPoolReadyNodeNumberRatioThreshold() float64 { + return PoolReadyNodeNumberRatioThresholdDefault +} + +func (pa *PodAdmission) userIsNodeController() bool { + return strings.Contains(pa.request.UserInfo.Username, "system:serviceaccount:kube-system:node-controller") +} + +func (pa *PodAdmission) validateReview() (*admissionv1.AdmissionReview, error) { + if pa.request.Kind.Kind != "Pod" { + err := fmt.Errorf("only pods are supported here") + return pa.reviewResponse(pa.request.UID, false, http.StatusBadRequest, ""), err + } + + if pa.request.Operation != admissionv1.Delete { + reason := fmt.Sprintf("Operation %v is accepted always", pa.request.Operation) + return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, reason), nil + } + + val, err := pa.validateDel() + if err != nil { + e := fmt.Sprintf("could not validate pod: %v", err) + return pa.reviewResponse(pa.request.UID, false, 
http.StatusBadRequest, e), err
+	}
+	if !val.Valid {
+		return pa.reviewResponse(pa.request.UID, false, http.StatusForbidden, val.Reason), nil
+	}
+
+	return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, val.Reason), nil
+}
+
+// validateDel returns true if a pod is valid to delete/evict
+func (pa *PodAdmission) validateDel() (validation, error) {
+	if pa.request.Operation == admissionv1.Delete {
+		if pa.userIsNodeController() {
+
+			// node is annotated for autonomy:
+			// although pods now get tolerations added to avoid eviction after the pool-coordinator
+			// introduction, we keep this logic for the time being to cover pods possibly created before.
+			if utils.NodeIsInAutonomy(pa.node) {
+				return validation{Valid: false, Reason: msgNodeAutonomy}, nil
+			}
+
+			if pa.pod.Annotations != nil {
+				// pod is annotated with node-scope autonomy
+				if pa.pod.Annotations[PodAutonomyAnnotation] == PodAutonomyNode {
+					return validation{Valid: false, Reason: msgPodAvailableNode}, nil
+				}
+
+				if pa.pod.Annotations[PodAutonomyAnnotation] == PodAutonomyPool {
+					if utils.NodeIsAlive(pa.leaseLister, pa.node.Name) {
+						return validation{Valid: false, Reason: msgPodAvailablePoolAndNodeIsAlive}, nil
+					} else {
+						pool, ok := utils.NodeNodepool(pa.node)
+						if ok {
+							// When the ratio of ready nodes in the node pool falls below a configurable
+							// threshold, we no longer allow pods to move within the pool.
+							// The threshold defaults to roughly one third (0.35) of the pool's nodes.
+							threshold := getPoolReadyNodeNumberRatioThreshold()
+							if float64(utils.CountAliveNode(pa.leaseLister, pa.nodepoolMap.Nodes(pool)))/float64(pa.nodepoolMap.Count(pool)) < threshold {
+								return validation{Valid: false, Reason: msgPoolHasTooFewReadyNodes}, nil
+							}
+						}
+						return validation{Valid: true, Reason: msgPodAvailablePoolAndNodeIsNotAlive}, nil
+					}
+				}
+			}
+		}
+	}
+	return validation{Valid: true, Reason: msgPodDeleteValidated}, nil
+}
+
+func (pa *PodAdmission) mutateAddToleration() ([]byte, error) {
+	toadd := []corev1.Toleration{
+		{Key: "node.kubernetes.io/unreachable",
+			Operator:          "Exists",
+			Effect:            "NoExecute",
+			TolerationSeconds: nil},
+		{Key: "node.kubernetes.io/not-ready",
+			Operator:          "Exists",
+			Effect:            "NoExecute",
+			TolerationSeconds: nil},
+	}
+	tols := pa.pod.Spec.Tolerations
+	merged, changed := utils.MergeTolerations(tols, toadd)
+	if !changed {
+		return nil, nil
+	}
+
+	mpod := pa.pod.DeepCopy()
+	mpod.Spec.Tolerations = merged
+
+	// generate a JSON patch from the difference between the original and the mutated pod
+	patch, err := jsondiff.Compare(pa.pod, mpod)
+	if err != nil {
+		return nil, err
+	}
+
+	patchb, err := json.Marshal(patch)
+	if err != nil {
+		return nil, err
+	}
+
+	return patchb, nil
+}
+
+func (pa *PodAdmission) mutateReview() (*admissionv1.AdmissionReview, error) {
+	if pa.request.Kind.Kind != "Pod" {
+		err := fmt.Errorf("only pods are supported here")
+		return pa.reviewResponse(pa.request.UID, false, http.StatusBadRequest, ""), err
+	}
+
+	if pa.node == nil {
+		return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, "node not assigned yet, nothing to do"), nil
+	}
+
+	if pa.request.Operation != admissionv1.Create && pa.request.Operation != admissionv1.Update {
+		reason := fmt.Sprintf("Operation %v is accepted always", pa.request.Operation)
+		return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, reason), nil
+	}
+
+	if !utils.NodeIsInAutonomy(pa.node) &&
+		(pa.pod.Annotations == nil || pa.pod.Annotations[PodAutonomyAnnotation] != PodAutonomyNode) {
+		return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, "no need of mutation"), nil
+	}
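+
+	// Beyond this point the pod either runs on an autonomy-annotated node or declares
+	// node-scope autonomy itself; such pods must tolerate the unreachable/not-ready
+	// NoExecute taints indefinitely so they are not evicted while their node is offline.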
+ + // add tolerations if not yet + klog.Infof("add tolerations to pod %s\n", pa.pod.Name) + val, err := pa.mutateAddToleration() + if err != nil { + return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, "could not merge tolerations"), err + } + if val == nil { + return pa.reviewResponse(pa.request.UID, true, http.StatusAccepted, "tolerations already existed"), nil + } + + return pa.patchReviewResponse(pa.request.UID, val) +} + +func (pa *PodAdmission) reviewResponse(uid types.UID, allowed bool, httpCode int32, reason string) *admissionv1.AdmissionReview { + return &admissionv1.AdmissionReview{ + TypeMeta: metav1.TypeMeta{ + Kind: "AdmissionReview", + APIVersion: "admission.k8s.io/v1", + }, + Response: &admissionv1.AdmissionResponse{ + UID: uid, + Allowed: allowed, + Result: &metav1.Status{ + Code: httpCode, + Message: reason, + }, + }, + } +} + +// patchReviewResponse builds an admission review with given json patch +func (pa *PodAdmission) patchReviewResponse(uid types.UID, patch []byte) (*admissionv1.AdmissionReview, error) { + patchType := admissionv1.PatchTypeJSONPatch + + return &admissionv1.AdmissionReview{ + TypeMeta: metav1.TypeMeta{ + Kind: "AdmissionReview", + APIVersion: "admission.k8s.io/v1", + }, + Response: &admissionv1.AdmissionResponse{ + UID: uid, + Allowed: true, + PatchType: &patchType, + Patch: patch, + }, + }, nil +} + +// ServeHealth returns 200 when things are good +func (h *PoolCoordinatorWebhook) serveHealth(w http.ResponseWriter, r *http.Request) { + klog.Info("uri", r.RequestURI) + fmt.Fprint(w, "OK") +} + +// ServeValidatePods validates an admission request and then writes an admission +func (h *PoolCoordinatorWebhook) serveValidatePods(w http.ResponseWriter, r *http.Request) { + klog.Info("uri", r.RequestURI) + + pa, err := h.NewPodAdmission(r) + if err != nil { + klog.Error(err) + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + out, err := pa.validateReview() + if err != nil { + e := fmt.Sprintf("could not generate admission response: %v", err) + klog.Error(e) + http.Error(w, e, http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + + jout, err := json.Marshal(out) + if err != nil { + e := fmt.Sprintf("could not parse admission response: %v", err) + klog.Error(e) + http.Error(w, e, http.StatusInternalServerError) + return + } + + klog.Info("sending response") + klog.Infof("%s", jout) + fmt.Fprintf(w, "%s", jout) +} + +// ServeMutatePods mutates an admission request and then writes an admission +func (h *PoolCoordinatorWebhook) serveMutatePods(w http.ResponseWriter, r *http.Request) { + klog.Info("uri", r.RequestURI) + + pa, err := h.NewPodAdmission(r) + if err != nil { + klog.Error(err) + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + out, err := pa.mutateReview() + if err != nil { + e := fmt.Sprintf("could not generate admission response: %v", err) + klog.Error(e) + http.Error(w, e, http.StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + + jout, err := json.Marshal(out) + if err != nil { + e := fmt.Sprintf("could not parse admission response: %v", err) + klog.Error(e) + http.Error(w, e, http.StatusInternalServerError) + return + } + + klog.Info("sending response") + fmt.Fprintf(w, "%s", jout) +} + +// parseRequest extracts an AdmissionReview from an http.Request if possible +func (h *PoolCoordinatorWebhook) parseRequest(r http.Request) (*admissionv1.AdmissionReview, error) { + if r.Header.Get("Content-Type") != 
"application/json" { + return nil, fmt.Errorf("Content-Type: %q should be %q", + r.Header.Get("Content-Type"), "application/json") + } + + bodybuf := new(bytes.Buffer) + bodybuf.ReadFrom(r.Body) + body := bodybuf.Bytes() + if len(body) == 0 { + return nil, fmt.Errorf("admission request body is empty") + } + + var a admissionv1.AdmissionReview + + if err := json.Unmarshal(body, &a); err != nil { + return nil, fmt.Errorf("could not parse admission review request: %v", err) + } + + if a.Request == nil { + return nil, fmt.Errorf("admission review can't be used: Request field is nil") + } + + return &a, nil +} + +func (h *PoolCoordinatorWebhook) NewPodAdmission(r *http.Request) (*PodAdmission, error) { + in, err := h.parseRequest(*r) + if err != nil { + return nil, err + } + + req := in.Request + + if req.Kind.Kind != "Pod" { + return nil, fmt.Errorf("only pods are supported") + } + + pod := &corev1.Pod{} + + if req.Operation == admissionv1.Delete || req.Operation == admissionv1.Update { + err = json.Unmarshal(req.OldObject.Raw, pod) + } else { + err = json.Unmarshal(req.Object.Raw, pod) + } + + if err != nil { + klog.Error(err) + return nil, err + } + + nodeName := pod.Spec.NodeSelector["kubernetes.io/hostname"] + klog.Infof("pod %s is on node: %s\n", pod.Name, nodeName) + + var node *corev1.Node + if h.nodeLister != nil { + node, _ = h.nodeLister.Get(nodeName) + } + + pa := &PodAdmission{ + request: req, + pod: pod, + node: node, + leaseLister: h.leaseLister, + nodepoolMap: h.nodepoolMap, + } + + klog.Infof("name: %s, namespace: %s, operation: %s, from: %v", + req.Name, req.Namespace, req.Operation, &req.UserInfo) + + return pa, nil +} + +func NewPoolcoordinatorWebhook(kc client.Interface, informerFactory informers.SharedInformerFactory) *PoolCoordinatorWebhook { + h := &PoolCoordinatorWebhook{ + client: kc, + nodePoolUpdateQueue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), + } + + h.nodepoolMap = utils.NewNodepoolMap() + + if informerFactory != nil { + h.nodeInformer = informerFactory.Core().V1().Nodes() + h.nodeSynced = h.nodeInformer.Informer().HasSynced + h.nodeLister = h.nodeInformer.Lister() + h.leaseInformer = informerFactory.Coordination().V1().Leases() + h.leaseSynced = h.leaseInformer.Informer().HasSynced + h.leaseLister = h.leaseInformer.Lister().Leases(corev1.NamespaceNodeLease) + h.nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: h.onNodeCreate, + UpdateFunc: h.onNodeUpdate, + DeleteFunc: h.onNodeDelete, + }) + } + + h.serviceName = utils.GetEnv("WEBHOOK_SERVICE_NAME", "yurt-controller-manager-webhook") + h.namespace = utils.GetEnv("WEBHOOK_NAMESPACE", "kube-system") + h.validatingConfigurationName = utils.GetEnv("WEBHOOK_POD_VALIDATING_CONFIGURATION_NAME", "yurt-controller-manager") + h.mutatingConfigurationName = utils.GetEnv("WEBHOOK_POD_MUTATING_CONFIGURATION_NAME", "yurt-controller-manager") + h.validatingName = utils.GetEnv("WEBHOOK_POD_VALIDATING_NAME", "vpoolcoordinator.openyurt.io") + h.mutatingName = utils.GetEnv("WEBHOOK_POD_MUTATING_NAME", "mpoolcoordinator.openyurt.io") + h.validatingPath = utils.GetEnv("WEBHOOK_POD_VALIDATING_PATH", "/pool-coordinator-webhook-validate") + h.mutatingPath = utils.GetEnv("WEBHOOK_POD_MUTATING_PATH", "/pool-coordinator-webhook-mutate") + + return h +} + +func (h *PoolCoordinatorWebhook) onNodeCreate(n interface{}) { + key, err := cache.MetaNamespaceKeyFunc(n) + if err == nil { + h.nodePoolUpdateQueue.Add(key) + } +} + +func (h *PoolCoordinatorWebhook) onNodeDelete(n 
interface{}) {
+	key, err := cache.MetaNamespaceKeyFunc(n)
+	if err == nil {
+		h.nodePoolUpdateQueue.Add(key)
+	}
+}
+
+func (h *PoolCoordinatorWebhook) onNodeUpdate(o interface{}, n interface{}) {
+	key, err := cache.MetaNamespaceKeyFunc(n)
+	if err == nil {
+		h.nodePoolUpdateQueue.Add(key)
+	}
+}
+
+func (h *PoolCoordinatorWebhook) syncHandler(key string) error {
+	_, name, err := cache.SplitMetaNamespaceKey(key)
+	if err != nil {
+		return fmt.Errorf("invalid resource key: %s", key)
+	}
+
+	node, err := h.nodeLister.Get(name)
+	if node == nil && err != nil {
+		// the node has been deleted
+		h.nodepoolMap.DelNode(name)
+		klog.Infof("node %s removed\n", name)
+		return nil
+	}
+
+	pool, ok := utils.NodeNodepool(node)
+	if ok {
+		if opool, ok := h.nodepoolMap.GetPool(name); ok {
+			if opool == pool {
+				return nil
+			} else {
+				h.nodepoolMap.Del(opool, name)
+				klog.Infof("pool %s: node %s removed\n", opool, name)
+			}
+		}
+		h.nodepoolMap.Add(pool, name)
+		klog.Infof("pool %s: node %s added\n", pool, name)
+	} else {
+		h.nodepoolMap.DelNode(name)
+		klog.Infof("node %s removed\n", name)
+	}
+
+	h.nodePoolUpdateQueue.Done(key)
+	return nil
+}
+
+func (h *PoolCoordinatorWebhook) nodePoolWorker() {
+	for {
+		key, shutdown := h.nodePoolUpdateQueue.Get()
+		if shutdown {
+			klog.Info("nodepool work queue shutdown")
+			return
+		}
+
+		if err := h.syncHandler(key.(string)); err != nil {
+			if h.nodePoolUpdateQueue.NumRequeues(key) < MaxRetries {
+				klog.Infof("error syncing event %v: %v", key, err)
+				h.nodePoolUpdateQueue.AddRateLimited(key)
+				h.nodePoolUpdateQueue.Done(key)
+				continue
+			}
+			runtime.HandleError(err)
+		}
+
+		h.nodePoolUpdateQueue.Forget(key)
+		h.nodePoolUpdateQueue.Done(key)
+	}
+}
+
+func (h *PoolCoordinatorWebhook) Handler() []Handler {
+	return []Handler{
+		{h.mutatingPath, h.serveMutatePods},
+		{h.validatingPath, h.serveValidatePods},
+	}
+}
+
+func (h *PoolCoordinatorWebhook) ensureValidatingConfiguration(certs *Certs) {
+	fail := admissionregistrationv1.Fail
+	sideEffects := admissionregistrationv1.SideEffectClassNone
+	config := &admissionregistrationv1.ValidatingWebhookConfiguration{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: h.validatingConfigurationName,
+		},
+		Webhooks: []admissionregistrationv1.ValidatingWebhook{{
+			Name: h.validatingName,
+			ClientConfig: admissionregistrationv1.WebhookClientConfig{
+				CABundle: certs.CACert,
+				Service: &admissionregistrationv1.ServiceReference{
+					Name:      h.serviceName,
+					Namespace: h.namespace,
+					Path:      &h.validatingPath,
+				},
+			},
+			Rules: []admissionregistrationv1.RuleWithOperations{
+				{Operations: []admissionregistrationv1.OperationType{
+					admissionregistrationv1.Delete},
+					Rule: admissionregistrationv1.Rule{
+						APIGroups:   []string{""},
+						APIVersions: []string{"v1"},
+						Resources:   []string{"pods"},
+					},
+				}},
+			FailurePolicy:           &fail,
+			SideEffects:             &sideEffects,
+			AdmissionReviewVersions: []string{"v1"},
+		}},
+	}
+
+	if h.client != nil {
+		if _, err := h.client.AdmissionregistrationV1().ValidatingWebhookConfigurations().Get(
+			context.TODO(), h.validatingConfigurationName, metav1.GetOptions{}); err != nil {
+			if errors.IsNotFound(err) {
+				klog.Infof("validatingwebhookconfiguration %s not found, create it.", h.validatingConfigurationName)
+				if _, err = h.client.AdmissionregistrationV1().ValidatingWebhookConfigurations().Create(
+					context.TODO(), config, metav1.CreateOptions{}); err != nil {
+					klog.Fatal(err)
+				}
+			}
+		} else {
+			klog.Infof("validatingwebhookconfiguration %s already exists, update it.", h.validatingConfigurationName)
+			if _, err = h.client.AdmissionregistrationV1().ValidatingWebhookConfigurations().Update(
+				context.TODO(), config, metav1.UpdateOptions{}); err != nil {
+				klog.Fatal(err)
+			}
+		}
+	}
+}
+
+func (h *PoolCoordinatorWebhook) ensureMutatingConfiguration(certs *Certs) {
+	fail := admissionregistrationv1.Fail
+	sideEffects := admissionregistrationv1.SideEffectClassNone
+	config := &admissionregistrationv1.MutatingWebhookConfiguration{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: h.mutatingConfigurationName,
+		},
+		Webhooks: []admissionregistrationv1.MutatingWebhook{{
+			Name: h.mutatingName,
+			ClientConfig: admissionregistrationv1.WebhookClientConfig{
+				CABundle: certs.CACert,
+				Service: &admissionregistrationv1.ServiceReference{
+					Name:      h.serviceName,
+					Namespace: h.namespace,
+					Path:      &h.mutatingPath,
+				},
+			},
+			Rules: []admissionregistrationv1.RuleWithOperations{
+				{Operations: []admissionregistrationv1.OperationType{
+					admissionregistrationv1.Create,
+					admissionregistrationv1.Update},
+					Rule: admissionregistrationv1.Rule{
+						APIGroups:   []string{""},
+						APIVersions: []string{"v1"},
+						Resources:   []string{"pods"},
+					},
+				}},
+			FailurePolicy:           &fail,
+			SideEffects:             &sideEffects,
+			AdmissionReviewVersions: []string{"v1"},
+		}},
+	}
+
+	if h.client != nil {
+		if _, err := h.client.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(
+			context.TODO(), h.mutatingConfigurationName, metav1.GetOptions{}); err != nil {
+			if errors.IsNotFound(err) {
+				klog.Infof("mutatingwebhookconfiguration %s not found, create it.", h.mutatingConfigurationName)
+				if _, err = h.client.AdmissionregistrationV1().MutatingWebhookConfigurations().Create(
+					context.TODO(), config, metav1.CreateOptions{}); err != nil {
+					klog.Fatal(err)
+				}
+			}
+		} else {
+			klog.Infof("mutatingwebhookconfiguration %s already exists, update it.", h.mutatingConfigurationName)
+			if _, err = h.client.AdmissionregistrationV1().MutatingWebhookConfigurations().Update(
+				context.TODO(), config, metav1.UpdateOptions{}); err != nil {
+				klog.Fatal(err)
+			}
+		}
+	}
+}
+
+func (h *PoolCoordinatorWebhook) Init(certs *Certs, stopCH <-chan struct{}) {
+	if !cache.WaitForCacheSync(stopCH, h.nodeSynced, h.leaseSynced) {
+		klog.Error("sync poolcoordinator webhook timeout")
+	}
+
+	klog.Info("populate nodepool map")
+	nl, err := h.nodeLister.List(labels.Everything())
+	if err != nil {
+		klog.Error(err)
+	}
+	h.nodepoolMap.Sync(nl)
+
+	klog.Info("start nodepool maintenance worker")
+	go h.nodePoolWorker()
+
+	go func() {
+		defer h.nodePoolUpdateQueue.ShutDown()
+		<-stopCH
+	}()
+
+	h.ensureValidatingConfiguration(certs)
+	h.ensureMutatingConfiguration(certs)
+}
diff --git a/pkg/webhook/poolcoordinator_webhook_test.go b/pkg/webhook/poolcoordinator_webhook_test.go
new file mode 100644
index 00000000000..c4af98aca78
--- /dev/null
+++ b/pkg/webhook/poolcoordinator_webhook_test.go
@@ -0,0 +1,169 @@
+/*
+Copyright 2023 The OpenYurt Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package webhook + +import ( + "bytes" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + corev1 "k8s.io/api/core/v1" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/constant" +) + +func genPodCrteateRequest() *http.Request { + body := []byte(`{"kind":"AdmissionReview","apiVersion":"admission.k8s.io/v1","request":{"uid":"4378f340-83a6-4c8c-88bc-9b8788db02d2","kind":{"group":"","version":"v1","kind":"Pod"},"resource":{"group":"","version":"v1","resource":"pods"},"requestKind":{"group":"","version":"v1","kind":"Pod"},"requestResource":{"group":"","version":"v1","resource":"pods"},"name":"nginx","namespace":"default","operation":"CREATE","userInfo":{"username":"kubernetes-admin","groups":["system:masters","system:authenticated"]},"object":{"kind":"Pod","apiVersion":"v1","metadata":{"name":"nginx","creationTimestamp":null,"labels":{"run":"nginx"},"managedFields":[{"manager":"kubectl","operation":"Update","apiVersion":"v1","time":"2023-01-13T07:21:09Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:labels":{".":{},"f:run":{}}},"f:spec":{"f:containers":{"k:{\"name\":\"nginx\"}":{".":{},"f:image":{},"f:imagePullPolicy":{},"f:name":{},"f:resources":{},"f:terminationMessagePath":{},"f:terminationMessagePolicy":{}}},"f:dnsPolicy":{},"f:enableServiceLinks":{},"f:nodeSelector":{".":{},"f:kubernetes.io/hostname":{}},"f:restartPolicy":{},"f:schedulerName":{},"f:securityContext":{},"f:terminationGracePeriodSeconds":{}}}}]},"spec":{"volumes":[{"name":"default-token-b7xcv","secret":{"secretName":"default-token-b7xcv"}}],"containers":[{"name":"nginx","image":"nginx","resources":{},"volumeMounts":[{"name":"default-token-b7xcv","readOnly":true,"mountPath":"/var/run/secrets/kubernetes.io/serviceaccount"}],"terminationMessagePath":"/dev/termination-log","terminationMessagePolicy":"File","imagePullPolicy":"Always"}],"restartPolicy":"Always","terminationGracePeriodSeconds":30,"dnsPolicy":"ClusterFirst","nodeSelector":{"kubernetes.io/hostname":"ai-ice-vm05"},"serviceAccountName":"default","serviceAccount":"default","securityContext":{},"schedulerName":"default-scheduler","tolerations":[{"key":"node.kubernetes.io/not-ready","operator":"Exists","effect":"NoExecute","tolerationSeconds":300},{"key":"node.kubernetes.io/unreachable","operator":"Exists","effect":"NoExecute","tolerationSeconds":300}],"priority":0,"enableServiceLinks":true},"status":{}},"oldObject":null,"dryRun":false,"options":{"kind":"CreateOptions","apiVersion":"meta.k8s.io/v1"}}}`) + req, _ := http.NewRequest(http.MethodPost, + "yurt-controller-manager-webhook.kube-system.svc:443/pool-coordinator-webhook-mutate?timeout=10s", + bytes.NewReader(body)) + req.Header = http.Header{ + "Accept": []string{"application/json, */*"}, + "Accept-Encoding": []string{"gzip"}, + "Content-Type": []string{"application/json"}, + "User-Agent": []string{"kube-apiserver-admission"}, + } + req.Host = "yurt-controller-manager-webhook.kube-system.svc:443" + req.RemoteAddr = "192.168.122.247:57550" + req.RequestURI = "/pool-coordinator-webhook-mutate?timeout=10s" + + return req +} + +func genPodValidateRequest(body []byte) *http.Request { + req, _ := http.NewRequest(http.MethodPost, + "yurt-controller-manager-webhook.kube-system.svc:443/pool-coordinator-webhook-validate", + bytes.NewReader(body)) + req.Header = http.Header{ + "Accept": []string{"application/json, */*"}, + "Accept-Encoding": []string{"gzip"}, + "Content-Type": []string{"application/json"}, + "User-Agent": []string{"kube-apiserver-admission"}, + } + req.Host = 
"yurt-controller-manager-webhook.kube-system.svc:443" + req.RemoteAddr = "192.168.122.247:57550" + req.RequestURI = "/pool-coordinator-webhook-validate?timeout=10s" + + return req +} + +func genPodDeleteRequestNormal() *http.Request { + body := []byte(`{"kind":"AdmissionReview","apiVersion":"admission.k8s.io/v1","request":{"uid":"dd7a36a4-229a-4491-b879-af5fe09eaf0d","kind":{"group":"","version":"v1","kind":"Pod"},"resource":{"group":"","version":"v1","resource":"pods"},"requestKind":{"group":"","version":"v1","kind":"Pod"},"requestResource":{"group":"","version":"v1","resource":"pods"},"name":"nginx","namespace":"default","operation":"DELETE","userInfo":{"username":"system:node:ai-ice-vm05","groups":["system:nodes","system:authenticated"]},"object":null,"oldObject":{"kind":"Pod","apiVersion":"v1","metadata":{"name":"nginx","namespace":"default","uid":"41bf977f-0c95-4f97-8c32-baf8412b8a79","resourceVersion":"77701767","creationTimestamp":"2023-01-13T07:21:09Z","deletionTimestamp":"2023-01-13T08:27:00Z","deletionGracePeriodSeconds":0,"labels":{"run":"nginx"},"annotations":{"cni.projectcalico.org/containerID":"c5ab9fac8e3227d0cd8d6013029d6e5e01ae4d57ad040b2995b18f50efac8a8e","cni.projectcalico.org/podIP":"","cni.projectcalico.org/podIPs":""},"managedFields":[{"manager":"kubectl","operation":"Update","apiVersion":"v1","time":"2023-01-13T07:21:09Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:labels":{".":{},"f:run":{}}},"f:spec":{"f:containers":{"k:{\"name\":\"nginx\"}":{".":{},"f:image":{},"f:imagePullPolicy":{},"f:name":{},"f:resources":{},"f:terminationMessagePath":{},"f:terminationMessagePolicy":{}}},"f:dnsPolicy":{},"f:enableServiceLinks":{},"f:nodeSelector":{".":{},"f:kubernetes.io/hostname":{}},"f:restartPolicy":{},"f:schedulerName":{},"f:securityContext":{},"f:terminationGracePeriodSeconds":{}}}},{"manager":"calico","operation":"Update","apiVersion":"v1","time":"2023-01-13T07:21:11Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:cni.projectcalico.org/containerID":{},"f:cni.projectcalico.org/podIP":{},"f:cni.projectcalico.org/podIPs":{}}}}},{"manager":"kubelet","operation":"Update","apiVersion":"v1","time":"2023-01-13T08:27:02Z","fieldsType":"FieldsV1","fieldsV1":{"f:status":{"f:conditions":{"k:{\"type\":\"ContainersReady\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"f:message":{},"f:reason":{},"f:status":{},"f:type":{}},"k:{\"type\":\"Initialized\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"f:status":{},"f:type":{}},"k:{\"type\":\"Ready\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"f:message":{},"f:reason":{},"f:status":{},"f:type":{}}},"f:containerStatuses":{},"f:hostIP":{},"f:phase":{},"f:startTime":{}}}}]},"spec":{"volumes":[{"name":"default-token-b7xcv","secret":{"secretName":"default-token-b7xcv","defaultMode":420}}],"containers":[{"name":"nginx","image":"nginx","resources":{},"volumeMounts":[{"name":"default-token-b7xcv","readOnly":true,"mountPath":"/var/run/secrets/kubernetes.io/serviceaccount"}],"terminationMessagePath":"/dev/termination-log","terminationMessagePolicy":"File","imagePullPolicy":"Always"}],"restartPolicy":"Always","terminationGracePeriodSeconds":30,"dnsPolicy":"ClusterFirst","nodeSelector":{"kubernetes.io/hostname":"ai-ice-vm05"},"serviceAccountName":"default","serviceAccount":"default","nodeName":"ai-ice-vm05","securityContext":{},"schedulerName":"default-scheduler","tolerations":[{"key":"node.kubernetes.io/unreachable","operator":"Exists","effect":"NoExecute"},{"key":"node.kuberne
tes.io/not-ready","operator":"Exists","effect":"NoExecute"}],"priority":0,"enableServiceLinks":true},"status":{"phase":"Pending","conditions":[{"type":"Initialized","status":"True","lastProbeTime":null,"lastTransitionTime":"2023-01-13T07:21:15Z"},{"type":"Ready","status":"False","lastProbeTime":null,"lastTransitionTime":"2023-01-13T08:27:08Z","reason":"ContainersNotReady","message":"containers with unready status: [nginx]"},{"type":"ContainersReady","status":"False","lastProbeTime":null,"lastTransitionTime":"2023-01-13T08:27:08Z","reason":"ContainersNotReady","message":"containers with unready status: [nginx]"},{"type":"PodScheduled","status":"True","lastProbeTime":null,"lastTransitionTime":"2023-01-13T07:21:09Z"}],"hostIP":"192.168.122.90","startTime":"2023-01-13T07:21:15Z","containerStatuses":[{"name":"nginx","state":{"waiting":{"reason":"ContainerCreating"}},"lastState":{},"ready":false,"restartCount":0,"image":"nginx","imageID":"","started":false}],"qosClass":"BestEffort"}},"dryRun":false,"options":{"kind":"DeleteOptions","apiVersion":"meta.k8s.io/v1","gracePeriodSeconds":0,"preconditions":{"uid":"41bf977f-0c95-4f97-8c32-baf8412b8a79"}}}}`) + return genPodValidateRequest(body) +} + +func genPodDeleteRequestEviction() *http.Request { + body := []byte(`{"kind":"AdmissionReview","apiVersion":"admission.k8s.io/v1","request":{"uid":"dd7a36a4-229a-4491-b879-af5fe09eaf0d","kind":{"group":"","version":"v1","kind":"Pod"},"resource":{"group":"","version":"v1","resource":"pods"},"requestKind":{"group":"","version":"v1","kind":"Pod"},"requestResource":{"group":"","version":"v1","resource":"pods"},"name":"nginx","namespace":"default","operation":"DELETE","userInfo":{"username":"system:serviceaccount:kube-system:node-controller","groups":["system:nodes","system:authenticated"]},"object":null,"oldObject":{"kind":"Pod","apiVersion":"v1","metadata":{"name":"nginx","namespace":"default","uid":"41bf977f-0c95-4f97-8c32-baf8412b8a79","resourceVersion":"77701767","creationTimestamp":"2023-01-13T07:21:09Z","deletionTimestamp":"2023-01-13T08:27:00Z","deletionGracePeriodSeconds":0,"labels":{"run":"nginx"},"annotations":{"cni.projectcalico.org/containerID":"c5ab9fac8e3227d0cd8d6013029d6e5e01ae4d57ad040b2995b18f50efac8a8e","cni.projectcalico.org/podIP":"","cni.projectcalico.org/podIPs":""},"managedFields":[{"manager":"kubectl","operation":"Update","apiVersion":"v1","time":"2023-01-13T07:21:09Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:labels":{".":{},"f:run":{}}},"f:spec":{"f:containers":{"k:{\"name\":\"nginx\"}":{".":{},"f:image":{},"f:imagePullPolicy":{},"f:name":{},"f:resources":{},"f:terminationMessagePath":{},"f:terminationMessagePolicy":{}}},"f:dnsPolicy":{},"f:enableServiceLinks":{},"f:nodeSelector":{".":{},"f:kubernetes.io/hostname":{}},"f:restartPolicy":{},"f:schedulerName":{},"f:securityContext":{},"f:terminationGracePeriodSeconds":{}}}},{"manager":"calico","operation":"Update","apiVersion":"v1","time":"2023-01-13T07:21:11Z","fieldsType":"FieldsV1","fieldsV1":{"f:metadata":{"f:annotations":{".":{},"f:cni.projectcalico.org/containerID":{},"f:cni.projectcalico.org/podIP":{},"f:cni.projectcalico.org/podIPs":{}}}}},{"manager":"kubelet","operation":"Update","apiVersion":"v1","time":"2023-01-13T08:27:02Z","fieldsType":"FieldsV1","fieldsV1":{"f:status":{"f:conditions":{"k:{\"type\":\"ContainersReady\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"f:message":{},"f:reason":{},"f:status":{},"f:type":{}},"k:{\"type\":\"Initialized\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"
f:status":{},"f:type":{}},"k:{\"type\":\"Ready\"}":{".":{},"f:lastProbeTime":{},"f:lastTransitionTime":{},"f:message":{},"f:reason":{},"f:status":{},"f:type":{}}},"f:containerStatuses":{},"f:hostIP":{},"f:phase":{},"f:startTime":{}}}}]},"spec":{"volumes":[{"name":"default-token-b7xcv","secret":{"secretName":"default-token-b7xcv","defaultMode":420}}],"containers":[{"name":"nginx","image":"nginx","resources":{},"volumeMounts":[{"name":"default-token-b7xcv","readOnly":true,"mountPath":"/var/run/secrets/kubernetes.io/serviceaccount"}],"terminationMessagePath":"/dev/termination-log","terminationMessagePolicy":"File","imagePullPolicy":"Always"}],"restartPolicy":"Always","terminationGracePeriodSeconds":30,"dnsPolicy":"ClusterFirst","nodeSelector":{"kubernetes.io/hostname":"ai-ice-vm05"},"serviceAccountName":"default","serviceAccount":"default","nodeName":"ai-ice-vm05","securityContext":{},"schedulerName":"default-scheduler","tolerations":[{"key":"node.kubernetes.io/unreachable","operator":"Exists","effect":"NoExecute"},{"key":"node.kubernetes.io/not-ready","operator":"Exists","effect":"NoExecute"}],"priority":0,"enableServiceLinks":true},"status":{"phase":"Pending","conditions":[{"type":"Initialized","status":"True","lastProbeTime":null,"lastTransitionTime":"2023-01-13T07:21:15Z"},{"type":"Ready","status":"False","lastProbeTime":null,"lastTransitionTime":"2023-01-13T08:27:08Z","reason":"ContainersNotReady","message":"containers with unready status: [nginx]"},{"type":"ContainersReady","status":"False","lastProbeTime":null,"lastTransitionTime":"2023-01-13T08:27:08Z","reason":"ContainersNotReady","message":"containers with unready status: [nginx]"},{"type":"PodScheduled","status":"True","lastProbeTime":null,"lastTransitionTime":"2023-01-13T07:21:09Z"}],"hostIP":"192.168.122.90","startTime":"2023-01-13T07:21:15Z","containerStatuses":[{"name":"nginx","state":{"waiting":{"reason":"ContainerCreating"}},"lastState":{},"ready":false,"restartCount":0,"image":"nginx","imageID":"","started":false}],"qosClass":"BestEffort"}},"dryRun":false,"options":{"kind":"DeleteOptions","apiVersion":"meta.k8s.io/v1","gracePeriodSeconds":0,"preconditions":{"uid":"41bf977f-0c95-4f97-8c32-baf8412b8a79"}}}}`) + return genPodValidateRequest(body) +} + +func TestPodMutate(t *testing.T) { + fmt.Println(">>>> Test pod create") + req := genPodCrteateRequest() + + h := NewPoolcoordinatorWebhook(nil, nil) + + fmt.Println(">>>>>>>> Test when node is nil") + res := httptest.NewRecorder() + h.serveMutatePods(res, req) + + fmt.Println(">>>>>>>> Test when node is not nil") + req = genPodCrteateRequest() + pa, _ := h.NewPodAdmission(req) + pa.node = &corev1.Node{} + pa.pod.Annotations = map[string]string{} + pa.pod.Annotations[PodAutonomyAnnotation] = PodAutonomyNode + rev, _ := pa.mutateReview() + fmt.Printf("%v", rev) + +} + +func TestPodValidate(t *testing.T) { + fmt.Println("Test pod validate") + h := NewPoolcoordinatorWebhook(nil, nil) + + fmt.Println(">>>> Test normal pod delete") + req := genPodDeleteRequestNormal() + + pa, _ := h.NewPodAdmission(req) + rev, _ := pa.validateReview() + fmt.Printf("%v\n", rev) + if rev.Response.Allowed != true { + t.Fail() + } + + res := httptest.NewRecorder() + h.serveValidatePods(res, req) + + fmt.Println(">>>> Test pod eviction") + + fmt.Println(">>>>>>>> Test when node is nil") + req = genPodDeleteRequestEviction() + + pa, _ = h.NewPodAdmission(req) + pa.node = &corev1.Node{} + + fmt.Println(">>>>>>>> Test when node is in autonomy (leagcy)") + pa.node.Annotations = map[string]string{} + 
pa.node.Annotations[constant.AnnotationKeyNodeAutonomy] = "true" + rev, _ = pa.validateReview() + fmt.Printf("%v", rev) + if rev.Response.Allowed != false { + t.Fail() + } + + pa.node.Annotations = map[string]string{} + + fmt.Println(">>>>>>>> Test when pod autonomy mode is node") + pa.pod.Annotations[PodAutonomyAnnotation] = PodAutonomyNode + rev, _ = pa.validateReview() + fmt.Printf("%v\n", rev) + if rev.Response.Allowed != false { + t.Fail() + } + + fmt.Println(">>>>>>>> Test when pod autonomy mode is pool") + pa.pod.Annotations[PodAutonomyAnnotation] = PodAutonomyPool + pa.node.Labels = map[string]string{} + pa.node.Labels[constant.LabelKeyNodePool] = "ut" + pa.nodepoolMap.Add("ut", "ai-ice-vm05") + rev, _ = pa.validateReview() + fmt.Printf("%v\n", rev) + if rev.Response.Allowed != false { + t.Fail() + } +} + +func TestEnsureMutatingConfiguration(t *testing.T) { + h := NewPoolcoordinatorWebhook(nil, nil) + h.ensureMutatingConfiguration(&Certs{}) +} + +func TestEnsureValidatingConfiguration(t *testing.T) { + h := NewPoolcoordinatorWebhook(nil, nil) + h.ensureValidatingConfiguration(&Certs{}) +} + +func TestHandler(t *testing.T) { + h := NewPoolcoordinatorWebhook(nil, nil) + hs := h.Handler() + if len(hs) <= 0 { + t.Fail() + } +} diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go new file mode 100644 index 00000000000..46a3bc7dafb --- /dev/null +++ b/pkg/webhook/webhook.go @@ -0,0 +1,116 @@ +/* +Copyright 2022 The OpenYurt Authors. +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package webhook + +import ( + "fmt" + "net/http" + "time" + + "k8s.io/client-go/informers" + client "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + + "github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/utils" +) + +type Handler struct { + Path string + HttpHandler func(http.ResponseWriter, *http.Request) +} + +type Webhook interface { + Handler() []Handler + Init(*Certs, <-chan struct{}) +} + +type WebhookManager struct { + client client.Interface + certdir string + webhooks []Webhook +} + +func webhookCertDir() string { + return utils.GetEnv("WEBHOOK_CERT_DIR", "/tmp/k8s-webhook-server/serving-certs") +} + +func webhookNamespace() string { + return utils.GetEnv("WEBHOOK_NAMESPACE", "kube-system") +} + +func webhookServiceName() string { + return utils.GetEnv("WEBHOOK_SERVICE_NAME", "yurt-controller-manager-webhook") +} + +func webhookServicePort() string { + return utils.GetEnv("WEBHOOK_SERVICE_PORT", "9443") +} + +func NewWebhookManager(kc client.Interface, informerFactory informers.SharedInformerFactory) *WebhookManager { + m := &WebhookManager{} + + m.certdir = webhookCertDir() + + h := NewPoolcoordinatorWebhook(kc, informerFactory) + m.addWebhook(h) + + return m +} + +func (m *WebhookManager) addWebhook(webhook Webhook) { + m.webhooks = append(m.webhooks, webhook) +} + +func (m *WebhookManager) Run(stopCH <-chan struct{}) { + err := utils.EnsureDir(m.certdir) + if err != nil { + klog.Error(err) + } + crt := m.certdir + "/tls.crt" + key := m.certdir + "/tls.key" + + certs := GenerateCerts(webhookNamespace(), webhookServiceName()) + err = utils.WriteFile(crt, certs.Cert) + if err != nil { + klog.Error(err) + } + err = utils.WriteFile(key, certs.Key) + if err != nil { + klog.Error(err) + } + + for { + if utils.FileExists(crt) && utils.FileExists(key) { + klog.Info("tls key and cert ok.") + break + } else { + klog.Info("Wating for tls key and cert...") + time.Sleep(time.Second) + } + } + + for _, h := range m.webhooks { + h.Init(certs, stopCH) + for _, hh := range h.Handler() { + http.HandleFunc(hh.Path, hh.HttpHandler) + } + } + + klog.Infof("Listening on port %s ...", webhookServicePort()) + klog.Fatal(http.ListenAndServeTLS(fmt.Sprintf(":%s", webhookServicePort()), crt, key, nil)) +}
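
As a rough usage sketch (not something this patch adds), the WebhookManager introduced above could be started as below; the in-cluster client, the 30-second resync period, and the standalone main() are assumptions for illustration only. Note that NewWebhookManager must be called before the informer factory is started, because the pool-coordinator webhook registers its node and lease informers on that factory.

package main

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"

	"github.com/openyurtio/openyurt/pkg/webhook"
)

func main() {
	// Assumption: running in-cluster with permissions to manage webhook configurations.
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	kc := kubernetes.NewForConfigOrDie(cfg)

	// Create the manager first so its informers are registered on the factory.
	factory := informers.NewSharedInformerFactory(kc, 30*time.Second)
	m := webhook.NewWebhookManager(kc, factory)

	stopCh := make(chan struct{})
	factory.Start(stopCh)

	// Run generates self-signed serving certs, ensures the validating and
	// mutating webhook configurations, and serves TLS (port 9443 by default).
	m.Run(stopCh)
}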
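
The validating webhook's nodepool guard can also be illustrated in isolation: once fewer than the default ratio (0.35) of a pool's nodes hold fresh leases, pool-scope pods may no longer be evicted. A minimal sketch, assuming a three-node pool with a single alive node (the pool and node names are made up):

package main

import (
	"fmt"

	"github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/utils"
)

func main() {
	// Track pool membership the same way the webhook's node informer handler does.
	pools := utils.NewNodepoolMap()
	pools.Add("pool-a", "node1")
	pools.Add("pool-a", "node2")
	pools.Add("pool-a", "node3")

	// Assume only one node still renews its lease; the webhook itself derives
	// this number via CountAliveNode and the node-lease lister.
	alive := 1

	ratio := float64(alive) / float64(pools.Count("pool-a"))
	// Prints: alive ratio 0.33, eviction allowed: false
	fmt.Printf("alive ratio %.2f, eviction allowed: %v\n", ratio, ratio >= 0.35)
}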
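
The mutating webhook's toleration upgrade stands alone as well: utils.MergeTolerations keeps only the superset tolerations, so a pod's capped not-ready/unreachable tolerations are replaced by unbounded ones. A small sketch of that behavior (the 300-second values are illustrative):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	"github.com/openyurtio/openyurt/pkg/controller/poolcoordinator/utils"
)

func main() {
	// Tolerations the webhook injects: tolerate the NoExecute taints forever.
	toadd := []corev1.Toleration{
		{Key: "node.kubernetes.io/unreachable", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute},
		{Key: "node.kubernetes.io/not-ready", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute},
	}

	// What a typical pod already carries: the same keys, capped at 300 seconds.
	secs := int64(300)
	existing := []corev1.Toleration{
		{Key: "node.kubernetes.io/unreachable", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute, TolerationSeconds: &secs},
		{Key: "node.kubernetes.io/not-ready", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute, TolerationSeconds: &secs},
	}

	// The capped tolerations are dropped as redundant; only the unbounded ones survive.
	merged, changed := utils.MergeTolerations(existing, toadd)
	fmt.Println("changed:", changed, "kept:", len(merged))
	for _, t := range merged {
		fmt.Printf("  %s unbounded=%v\n", t.Key, t.TolerationSeconds == nil)
	}
}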