From a1a8b35b36dd287fe12685c29c86d10d07129c5f Mon Sep 17 00:00:00 2001 From: renxiangyu Date: Wed, 21 Feb 2024 16:49:39 +0800 Subject: [PATCH] feat: use webhooks to prevent the pod of notready's kosmosnode from being evicted Signed-off-by: renxiangyu --- Makefile | 1 + cmd/webhook/app/options/options.go | 63 +++++++++++ cmd/webhook/app/options/validation.go | 10 ++ cmd/webhook/app/webhook.go | 103 ++++++++++++++++++ cmd/webhook/main.go | 17 +++ deploy/webhook/deployment.yaml | 53 +++++++++ deploy/webhook/rbac.yaml | 33 ++++++ deploy/webhook/service.yaml | 12 ++ deploy/webhook/tls-secret.yaml | 9 ++ .../webhook/validate-delete-pod-config.yaml | 26 +++++ hack/gen-webhook-certs.sh | 42 +++++++ hack/util.sh | 1 + pkg/utils/k8s.go | 11 ++ pkg/webhook/validate_delete_pods.go | 74 +++++++++++++ 14 files changed, 455 insertions(+) create mode 100644 cmd/webhook/app/options/options.go create mode 100644 cmd/webhook/app/options/validation.go create mode 100644 cmd/webhook/app/webhook.go create mode 100644 cmd/webhook/main.go create mode 100644 deploy/webhook/deployment.yaml create mode 100644 deploy/webhook/rbac.yaml create mode 100644 deploy/webhook/service.yaml create mode 100644 deploy/webhook/tls-secret.yaml create mode 100644 deploy/webhook/validate-delete-pod-config.yaml create mode 100644 hack/gen-webhook-certs.sh create mode 100644 pkg/webhook/validate_delete_pods.go diff --git a/Makefile b/Makefile index f90b439fd..4d78a1ce4 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ TARGETS := clusterlink-controller-manager \ clusterlink-proxy \ clustertree-cluster-manager \ scheduler \ + webhook CTL_TARGETS := kosmosctl diff --git a/cmd/webhook/app/options/options.go b/cmd/webhook/app/options/options.go new file mode 100644 index 000000000..6e29a5752 --- /dev/null +++ b/cmd/webhook/app/options/options.go @@ -0,0 +1,63 @@ +package options + +import ( + "github.com/spf13/pflag" + "k8s.io/client-go/tools/leaderelection/resourcelock" + componentbaseconfig "k8s.io/component-base/config" + + "github.com/kosmos.io/kosmos/pkg/utils" + "github.com/kosmos.io/kosmos/pkg/webhook" +) + +type Options struct { + LeaderElection componentbaseconfig.LeaderElectionConfiguration + KubernetesOptions KubernetesOptions + WebhookServerOptions WebhookServerOptions + PodValidatorOptions webhook.PodValidatorOptions +} + +type KubernetesOptions struct { + KubeConfig string `json:"kubeconfig" yaml:"kubeconfig"` + Master string `json:"master,omitempty" yaml:"master,omitempty"` + QPS float32 `json:"qps,omitempty" yaml:"qps,omitempty"` + Burst int `json:"burst,omitempty" yaml:"burst,omitempty"` +} + +type WebhookServerOptions struct { + Host string + Port int + CertDir string + CertName string + KeyName string +} + +func NewOptions() *Options { + return &Options{ + LeaderElection: componentbaseconfig.LeaderElectionConfiguration{ + LeaderElect: true, + ResourceLock: resourcelock.LeasesResourceLock, + ResourceNamespace: utils.DefaultNamespace, + ResourceName: "network-manager", + }, + } +} + +func (o *Options) AddFlags(flags *pflag.FlagSet) { + if o == nil { + return + } + + flags.BoolVar(&o.LeaderElection.LeaderElect, "leader-elect", true, "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.") + flags.StringVar(&o.LeaderElection.ResourceName, "leader-elect-resource-name", "kosmos-webhook", "The name of resource object that is used for locking during leader election.") + flags.StringVar(&o.LeaderElection.ResourceNamespace, "leader-elect-resource-namespace", utils.DefaultNamespace, "The namespace of resource object that is used for locking during leader election.") + flags.Float32Var(&o.KubernetesOptions.QPS, "kube-qps", 40.0, "QPS to use while talking with kube-apiserver.") + flags.IntVar(&o.KubernetesOptions.Burst, "kube-burst", 60, "Burst to use while talking with kube-apiserver.") + flags.StringVar(&o.KubernetesOptions.KubeConfig, "kubeconfig", "", "Path for kubernetes kubeconfig file, if left blank, will use in cluster way.") + flags.StringVar(&o.KubernetesOptions.Master, "master", "", "Used to generate kubeconfig for downloading, if not specified, will use host in kubeconfig.") + flags.StringVar(&o.WebhookServerOptions.Host, "bind-address", "0.0.0.0", "The IP address on which to listen for the --secure-port port.") + flags.IntVar(&o.WebhookServerOptions.Port, "secure-port", 9443, "The secure port on which to serve HTTPS.") + flags.StringVar(&o.WebhookServerOptions.CertDir, "cert-dir", "/etc/certs", "The directory that contains the server key and certificate.") + flags.StringVar(&o.WebhookServerOptions.CertName, "tls-cert-file-name", "tls.crt", "The name of server certificate.") + flags.StringVar(&o.WebhookServerOptions.KeyName, "tls-private-key-file-name", "tls.key", "The name of server key.") + flags.StringArrayVar(&o.PodValidatorOptions.UsernamesNeedToPrevent, "usernames-need-to-prevent", []string{"system:serviceaccount:kube-system:node-controller"}, "Usernames that need to prevent deleting pods on NotReady kosmos nodes.") +} diff --git a/cmd/webhook/app/options/validation.go b/cmd/webhook/app/options/validation.go new file mode 100644 index 000000000..6d18c85a2 --- /dev/null +++ b/cmd/webhook/app/options/validation.go @@ -0,0 +1,10 @@ +package options + +import "k8s.io/apimachinery/pkg/util/validation/field" + +// Validate checks Options and return a slice of found errs. +func (o *Options) Validate() field.ErrorList { + errs := field.ErrorList{} + + return errs +} diff --git a/cmd/webhook/app/webhook.go b/cmd/webhook/app/webhook.go new file mode 100644 index 000000000..3640f9584 --- /dev/null +++ b/cmd/webhook/app/webhook.go @@ -0,0 +1,103 @@ +package app + +import ( + "context" + "fmt" + "net/http" + + "github.com/spf13/cobra" + "k8s.io/client-go/tools/clientcmd" + cliflag "k8s.io/component-base/cli/flag" + "k8s.io/klog/v2" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/webhook" + + "github.com/kosmos.io/kosmos/cmd/webhook/app/options" + "github.com/kosmos.io/kosmos/pkg/scheme" + "github.com/kosmos.io/kosmos/pkg/sharedcli/klogflag" + kosmoswebhook "github.com/kosmos.io/kosmos/pkg/webhook" +) + +func NewWebhookCommand(ctx context.Context) *cobra.Command { + opts := options.NewOptions() + + cmd := &cobra.Command{ + Use: "kosmos-webhook", + Long: `TODO`, + RunE: func(cmd *cobra.Command, args []string) error { + if errs := opts.Validate(); len(errs) != 0 { + return errs.ToAggregate() + } + if err := Run(ctx, opts); err != nil { + return err + } + return nil + }, + Args: func(cmd *cobra.Command, args []string) error { + for _, arg := range args { + if len(arg) > 0 { + return fmt.Errorf("%q does not take any arguments, got %q", cmd.CommandPath(), args) + } + } + return nil + }, + } + + fss := cliflag.NamedFlagSets{} + + genericFlagSet := fss.FlagSet("generic") + opts.AddFlags(genericFlagSet) + + logsFlagSet := fss.FlagSet("logs") + klogflag.Add(logsFlagSet) + + cmd.Flags().AddFlagSet(genericFlagSet) + cmd.Flags().AddFlagSet(logsFlagSet) + + return cmd +} + +func Run(ctx context.Context, opts *options.Options) error { + config, err := clientcmd.BuildConfigFromFlags(opts.KubernetesOptions.Master, opts.KubernetesOptions.KubeConfig) + if err != nil { + panic(err) + } + config.QPS, config.Burst = opts.KubernetesOptions.QPS, opts.KubernetesOptions.Burst + + mgr, err := controllerruntime.NewManager(config, controllerruntime.Options{ + Logger: klog.Background(), + Scheme: scheme.NewSchema(), + WebhookServer: &webhook.Server{ + Host: opts.WebhookServerOptions.Host, + Port: opts.WebhookServerOptions.Port, + CertDir: opts.WebhookServerOptions.CertDir, + CertName: opts.WebhookServerOptions.CertName, + KeyName: opts.WebhookServerOptions.KeyName, + }, + MetricsBindAddress: "0", + HealthProbeBindAddress: "0", + LeaderElection: opts.LeaderElection.LeaderElect, + LeaderElectionID: opts.LeaderElection.ResourceName, + LeaderElectionNamespace: opts.LeaderElection.ResourceNamespace, + }) + if err != nil { + klog.Errorf("failed to build webhook server: %v", err) + return err + } + + hookServer := mgr.GetWebhookServer() + // register an Admission Webhook + hookServer.Register("/validate-delete-pod", &webhook.Admission{Handler: &kosmoswebhook.PodValidator{ + Client: mgr.GetClient(), + Options: opts.PodValidatorOptions, + }}) + hookServer.WebhookMux.Handle("/health", http.StripPrefix("/health", &healthz.Handler{})) + + if err := mgr.Start(ctx); err != nil { + klog.Errorf("failed to start webhook manager: %v", err) + return err + } + + return nil +} diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go new file mode 100644 index 000000000..0bcbe68f2 --- /dev/null +++ b/cmd/webhook/main.go @@ -0,0 +1,17 @@ +package main + +import ( + "os" + + apiserver "k8s.io/apiserver/pkg/server" + "k8s.io/component-base/cli" + + "github.com/kosmos.io/kosmos/cmd/webhook/app" +) + +func main() { + ctx := apiserver.SetupSignalContext() + cmd := app.NewWebhookCommand(ctx) + code := cli.Run(cmd) + os.Exit(code) +} diff --git a/deploy/webhook/deployment.yaml b/deploy/webhook/deployment.yaml new file mode 100644 index 000000000..81658c4cb --- /dev/null +++ b/deploy/webhook/deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kosmos-webhook + namespace: kosmos-system +spec: + selector: + matchLabels: + app: kosmos-webhook + template: + metadata: + labels: + app: kosmos-webhook + spec: + serviceAccountName: kosmos-webhook + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kosmos.io/node + operator: DoesNotExist + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - kosmos-webhook + namespaces: + - kosmos-system + topologyKey: kubernetes.io/hostname + containers: + - image: ghcr.io/kosmos-io/webhook:__VERSION__ + name: kosmos-webhook + volumeMounts: + - name: tls + mountPath: "/etc/certs" + command: + - webhook + - --v=4 + resources: + limits: + memory: 500Mi + cpu: 500m + requests: + cpu: 500m + memory: 500Mi + volumes: + - name: tls + secret: + secretName: kosmos-webhook-tls \ No newline at end of file diff --git a/deploy/webhook/rbac.yaml b/deploy/webhook/rbac.yaml new file mode 100644 index 000000000..9bc3ff6ba --- /dev/null +++ b/deploy/webhook/rbac.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kosmos-webhook + namespace: kosmos-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kosmos-webhook +rules: + - apiGroups: ['*'] + resources: ["nodes", "pods"] + verbs: ["*"] + - apiGroups: [ "coordination.k8s.io" ] + resources: [ "leases" ] + verbs: [ "get", "list", "watch", "create", "update", "patch", "delete" ] + - apiGroups: [""] + resources: ["events"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kosmos-webhook +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kosmos-webhook +subjects: + - kind: ServiceAccount + name: kosmos-webhook + namespace: kosmos-system \ No newline at end of file diff --git a/deploy/webhook/service.yaml b/deploy/webhook/service.yaml new file mode 100644 index 000000000..0b0419bfb --- /dev/null +++ b/deploy/webhook/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: kosmos-webhook + namespace: kosmos-system +spec: + ports: + - port: 9443 + protocol: TCP + targetPort: 9443 + selector: + app: kosmos-webhook \ No newline at end of file diff --git a/deploy/webhook/tls-secret.yaml b/deploy/webhook/tls-secret.yaml new file mode 100644 index 000000000..805c3a1f9 --- /dev/null +++ b/deploy/webhook/tls-secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +data: + tls.crt: __BASE64_SERVER_CRT__ + tls.key: __BASE64_SERVER_KEY__ +kind: Secret +metadata: + name: kosmos-webhook-tls + namespace: kosmos-system +type: kubernetes.io/tls \ No newline at end of file diff --git a/deploy/webhook/validate-delete-pod-config.yaml b/deploy/webhook/validate-delete-pod-config.yaml new file mode 100644 index 000000000..0659e84cb --- /dev/null +++ b/deploy/webhook/validate-delete-pod-config.yaml @@ -0,0 +1,26 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: "validate-delete-pod.kosmos.io" +webhooks: + - name: "validate-delete-pod.kosmos.io" + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["DELETE"] + resources: ["pods"] + scope: "*" + admissionReviewVersions: ["v1"] + # FailurePolicy defines how unrecognized errors from the admission endpoint are handled - allowed values are + # Ignore or Fail. Defaults to Fail. + failurePolicy: Ignore + sideEffects: None + timeoutSeconds: 3 + clientConfig: + service: + namespace: kosmos-system + name: kosmos-webhook + path: /validate-delete-pod + port: 9443 + caBundle: | + __BASE64_CA_CRT__ \ No newline at end of file diff --git a/hack/gen-webhook-certs.sh b/hack/gen-webhook-certs.sh new file mode 100644 index 000000000..b1aa3aadb --- /dev/null +++ b/hack/gen-webhook-certs.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +WEBHOOK_NAME="kosmos-webhook" +NAMESPACE="kosmos-system" +# 有效期 +DAYS="36500" + +openssl genrsa -out ca.key 2048 + +# generate CA private keys and self-signed certificates: +openssl req -new -x509 -days ${DAYS} -key ca.key \ + -subj "/C=CN/CN=${WEBHOOK_NAME}"\ + -out ca.crt + +# fenerating Mutating Webhook Server private key and certificate signing request (csr) +openssl req -newkey rsa:2048 -nodes -keyout server.key \ + -subj "/C=CN/CN=${WEBHOOK_NAME}" \ + -out server.csr + +printf "subjectAltName=DNS:${WEBHOOK_NAME}.${NAMESPACE}.svc" > extfile.txt + +# sign the mutating webhook server certificate using the CA +openssl x509 -req \ + -extfile extfile.txt \ + -days ${DAYS} \ + -in server.csr \ + -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out server.crt + +echo +echo ">> Generating kube secrets..." +kubectl create secret tls ${WEBHOOK_NAME}-tls \ + --cert=server.crt \ + --key=server.key \ + --dry-run=client -o yaml \ + > tls-secret.yaml + +echo +echo ">> MutatingWebhookConfiguration caBundle:" +cat ca.crt | base64 | fold + +rm ca.crt ca.key ca.srl server.crt server.csr server.key extfile.txt \ No newline at end of file diff --git a/hack/util.sh b/hack/util.sh index d879d7837..cd53333c7 100755 --- a/hack/util.sh +++ b/hack/util.sh @@ -23,6 +23,7 @@ CLUSTERLINK_TARGET_SOURCE=( clusterlink-controller-manager=cmd/clusterlink/controller-manager clustertree-cluster-manager=cmd/clustertree/cluster-manager kosmosctl=cmd/kosmosctl + webhook=cmd/webhook ) #https://textkool.com/en/ascii-art-generator?hl=default&vl=default&font=DOS%20Rebel&text=KOSMOS diff --git a/pkg/utils/k8s.go b/pkg/utils/k8s.go index f5de46db0..8ad7bdd9e 100644 --- a/pkg/utils/k8s.go +++ b/pkg/utils/k8s.go @@ -42,6 +42,7 @@ func CreateMergePatch(original, new interface{}) ([]byte, error) { return patch, nil } +// IsKosmosNode judge whether node is kosmos's node func IsKosmosNode(node *corev1.Node) bool { if node == nil { return false @@ -182,3 +183,13 @@ func ListResourceClusters(anno map[string]string) []string { owners := strings.Split(anno[KosmosResourceOwnersAnnotations], ",") return owners } + +// IsNotReady judge whether node is not ready +func IsNotReady(node *corev1.Node) bool { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue { + return false + } + } + return true +} diff --git a/pkg/webhook/validate_delete_pods.go b/pkg/webhook/validate_delete_pods.go new file mode 100644 index 000000000..657a15f66 --- /dev/null +++ b/pkg/webhook/validate_delete_pods.go @@ -0,0 +1,74 @@ +package webhook + +import ( + "context" + + admissionv1 "k8s.io/api/admission/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + "k8s.io/utils/strings/slices" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + "github.com/kosmos.io/kosmos/pkg/utils" +) + +type PodValidator struct { + client.Client + decoder *admission.Decoder + Options PodValidatorOptions +} + +type PodValidatorOptions struct { + UsernamesNeedToPrevent []string +} + +var _ admission.Handler = &PodValidator{} +var _ admission.DecoderInjector = &PodValidator{} + +func (v *PodValidator) Handle(ctx context.Context, req admission.Request) admission.Response { + if req.Operation != admissionv1.Delete { + return admission.Allowed("") + } + + // decode the old object from the request + pod := &corev1.Pod{} + err := v.decoder.DecodeRaw(req.OldObject, pod) + if err != nil { + klog.Warningf("Decode oldObject error: %v, skip", err) + return admission.Allowed("") + } + + if pod.Spec.NodeName == "" { + klog.V(4).Infof("Pod %s's nodeName is empty, skip", err) + return admission.Allowed("") + } + + node := &corev1.Node{} + if err = v.Client.Get(ctx, types.NamespacedName{ + Name: pod.Spec.NodeName, + }, node); err != nil { + klog.V(4).Infof("Failed to get pod %s/%s's node, nodeName: %s, error: %v", pod.Namespace, pod.Name, pod.Spec.NodeName, err) + return admission.Allowed("") + } + + // if the node is a kosmos node and is in the NotReady state, + // and the requested username is in the list of usernames that need to be prevented from deletion, + // the deletion operation is blocked. + if utils.IsKosmosNode(node) && utils.IsNotReady(node) && v.needToPrevent(req.UserInfo.Username) { + klog.Infof("Kosmos prevents pod deletion, name: %s, ns: %s", pod.Name, pod.Namespace) + return admission.Denied("Deleting pods of notReady kosmos nodes is not allowed.") + } + + return admission.Allowed("") +} + +func (v *PodValidator) InjectDecoder(d *admission.Decoder) error { + v.decoder = d + return nil +} + +func (v *PodValidator) needToPrevent(username string) bool { + return slices.Contains(v.Options.UsernamesNeedToPrevent, username) +}