From 495fc56074258bdf1b8a49dccd0065f6b1ec28a8 Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Tue, 7 Apr 2020 13:45:04 +0800 Subject: [PATCH 1/3] kill tidb-operator pods randomly in e2e --- ci/pingcap_tidb_operator_build_kind.groovy | 4 +- tests/config.go | 7 ++- tests/e2e/config/config.go | 4 ++ tests/e2e/e2e.go | 19 ++++++ tests/e2e/util/operator/operator.go | 71 ++++++++++++++++++++++ 5 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 tests/e2e/util/operator/operator.go diff --git a/ci/pingcap_tidb_operator_build_kind.groovy b/ci/pingcap_tidb_operator_build_kind.groovy index 4b8baf5b0b..6ff759fa60 100644 --- a/ci/pingcap_tidb_operator_build_kind.groovy +++ b/ci/pingcap_tidb_operator_build_kind.groovy @@ -238,13 +238,13 @@ def call(BUILD_BRANCH, CREDENTIALS_ID, CODECOV_CREDENTIALS_ID) { def MIRRORS = "DOCKER_IO_MIRROR=http://172.16.4.143:5000 QUAY_IO_MIRROR=http://172.16.4.143:5001" def builds = [:] builds["E2E v1.12.10"] = { - build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.12 IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images", artifacts) + build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.12 IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_ ./hack/e2e.sh -- --preload-images --operator-killer", artifacts) } builds["E2E v1.12.10 AdvancedStatefulSet"] = { build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.12-advanced-statefulset IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_advanced_statefulset ./hack/e2e.sh -- --preload-images --operator-features AdvancedStatefulSet=true", artifacts) } builds["E2E v1.18.0"] = { - build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.18 IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.18.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.18.0_ ./hack/e2e.sh -- 
-preload-images", artifacts) + build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.18 IMAGE_TAG=${GITHASH} SKIP_BUILD=y GINKGO_NODES=6 KUBE_VERSION=v1.18.0 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.18.0_ ./hack/e2e.sh -- -preload-images --operator-killer", artifacts) } builds["E2E v1.12.10 Serial"] = { build("${MIRRORS} RUNNER_SUITE_NAME=e2e-v1.12-serial IMAGE_TAG=${GITHASH} SKIP_BUILD=y KUBE_VERSION=v1.12.10 REPORT_DIR=\$(pwd)/artifacts REPORT_PREFIX=v1.12.10_serial_ ./hack/e2e.sh -- --preload-images --ginkgo.focus='\\[Serial\\]' --install-operator=false", artifacts) diff --git a/tests/config.go b/tests/config.go index 5cc4875e11..9dd70dd8b6 100644 --- a/tests/config.go +++ b/tests/config.go @@ -21,10 +21,9 @@ import ( "os" "strings" - "github.com/pingcap/tidb-operator/tests/slack" - + utiloperator "github.com/pingcap/tidb-operator/tests/e2e/util/operator" "github.com/pingcap/tidb-operator/tests/pkg/blockwriter" - + "github.com/pingcap/tidb-operator/tests/slack" "gopkg.in/yaml.v2" "k8s.io/klog" ) @@ -77,6 +76,8 @@ type Config struct { E2EImage string `yaml:"e2e_image" json:"e2e_image"` PreloadImages bool `yaml:"preload_images" json:"preload_images"` + + OperatorKiller utiloperator.OperatorKillerConfig } // Nodes defines a series of nodes that belong to the same physical node. 
diff --git a/tests/e2e/config/config.go b/tests/e2e/config/config.go index 73cf85028c..a037d122aa 100644 --- a/tests/e2e/config/config.go +++ b/tests/e2e/config/config.go @@ -17,6 +17,7 @@ import ( "flag" "fmt" "io/ioutil" + "time" "github.com/pingcap/tidb-operator/tests" v1 "k8s.io/api/core/v1" @@ -46,6 +47,9 @@ func RegisterTiDBOperatorFlags(flags *flag.FlagSet) { flags.StringVar(&TestConfig.ChartDir, "chart-dir", "", "chart dir") flags.BoolVar(&TestConfig.PreloadImages, "preload-images", false, "if set, preload images in the bootstrap of e2e process") flags.StringVar(&TestConfig.BackupImage, "backup-image", "", "backup image") + flags.BoolVar(&TestConfig.OperatorKiller.Enabled, "operator-killer", false, "whether to enable operator kill") + flags.DurationVar(&TestConfig.OperatorKiller.Interval, "operator-killer-interval", 5*time.Minute, "interval between operator kills") + flags.Float64Var(&TestConfig.OperatorKiller.JitterFactor, "operator-killer-jitter-factor", 1, "factor used to jitter operator kills") } func AfterReadingAllFlags() error { diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index 7df4fccf10..afe46d83a7 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -35,10 +35,12 @@ import ( e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config" utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image" utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node" + utiloperator "github.com/pingcap/tidb-operator/tests/e2e/util/operator" v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" runtimeutils "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" @@ -256,6 +258,20 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { ginkgo.By("Installing tidb-operator") oa.CleanOperatorOrDie(ocfg) 
oa.DeployOperatorOrDie(ocfg) + if e2econfig.TestConfig.OperatorKiller.Enabled { + operatorKiller := utiloperator.NewOperatorKiller(e2econfig.TestConfig.OperatorKiller, kubeCli, func() ([]v1.Pod, error) { + podList, err := kubeCli.CoreV1().Pods(ocfg.Namespace).List(metav1.ListOptions{ + LabelSelector: labels.SelectorFromSet(map[string]string{ + "app.kubernetes.io/name": "tidb-operator", + }).String(), + }) + if err != nil { + return nil, err + } + return podList.Items, nil + }) + go operatorKiller.Run(e2econfig.TestConfig.OperatorKiller.StopCh) + } } else { ginkgo.By("Skip installing tidb-operator") } @@ -269,6 +285,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() { framework.CleanupSuite() }, func() { framework.AfterSuiteActions() + if e2econfig.TestConfig.OperatorKiller.Enabled { + close(e2econfig.TestConfig.OperatorKiller.StopCh) + } }) // RunE2ETests checks configuration parameters (specified through flags) and then runs diff --git a/tests/e2e/util/operator/operator.go b/tests/e2e/util/operator/operator.go new file mode 100644 index 0000000000..0098825214 --- /dev/null +++ b/tests/e2e/util/operator/operator.go @@ -0,0 +1,71 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package operator + +import ( + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +// OperatorKillerConfig describes configuration for operator killer. 
+type OperatorKillerConfig struct { + Enabled bool + // Interval is time between operator failures. + Interval time.Duration + // Operator pods will be deleted between [Interval, Interval * (1.0 + JitterFactor)]. + JitterFactor float64 + // StopCh is a channel that is used to notify OperatorKiller to stop killing operator pods. + StopCh chan struct{} +} + +// OperatorKiller deletes pods of tidb-operator to simulate operator failures. +type OperatorKiller struct { + config OperatorKillerConfig + client kubernetes.Interface + podLister func() ([]v1.Pod, error) +} + +// NewOperatorKiller creates a new operator killer. +func NewOperatorKiller(config OperatorKillerConfig, client kubernetes.Interface, podLister func() ([]v1.Pod, error)) *OperatorKiller { + config.StopCh = make(chan struct{}) + return &OperatorKiller{ + config: config, + client: client, + podLister: podLister, + } +} + +// Run starts OperatorKiller until stopCh is closed. +func (k *OperatorKiller) Run(stopCh <-chan struct{}) { + // wait.JitterUntil starts work immediately, so wait first. 
+ time.Sleep(wait.Jitter(k.config.Interval, k.config.JitterFactor)) + wait.JitterUntil(func() { + pods, err := k.podLister() + if err != nil { + framework.Logf("failed to list operator pods: %v", err) + return + } + for _, pod := range pods { + err = k.client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{}) + if err != nil { + framework.Logf("failed to delete pod %s/%s: %v", pod.Namespace, pod.Name, err) + } + } + }, k.config.Interval, k.config.JitterFactor, true, stopCh) +} From 1879b75dcdb7ab089c980dfa43c07db62af2f274 Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Tue, 7 Apr 2020 14:40:18 +0800 Subject: [PATCH 2/3] don't use channel in configuration struct --- tests/e2e/e2e.go | 11 ++++++++--- tests/e2e/util/operator/operator.go | 3 --- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index afe46d83a7..9991599cdc 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -60,6 +60,10 @@ import ( _ "k8s.io/kubernetes/test/e2e/framework/providers/gce" ) +var ( + operatorKillerStopCh chan struct{} +) + // This is modified from framework.SetupSuite(). // setupSuite is the boilerplate that can be used to setup ginkgo test suites, on the SynchronizedBeforeSuite step. 
// There are certain operations we only want to run once per overall test invocation @@ -270,7 +274,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { } return podList.Items, nil }) - go operatorKiller.Run(e2econfig.TestConfig.OperatorKiller.StopCh) + operatorKillerStopCh = make(chan struct{}) // "=" not ":=": must set the package-level channel that AfterSuite closes + go operatorKiller.Run(operatorKillerStopCh) } } else { ginkgo.By("Skip installing tidb-operator") @@ -285,8 +290,8 @@ var _ = ginkgo.SynchronizedAfterSuite(func() { framework.CleanupSuite() }, func() { framework.AfterSuiteActions() - if e2econfig.TestConfig.OperatorKiller.Enabled { - close(e2econfig.TestConfig.OperatorKiller.StopCh) + if operatorKillerStopCh != nil { + close(operatorKillerStopCh) } }) diff --git a/tests/e2e/util/operator/operator.go b/tests/e2e/util/operator/operator.go index 0098825214..617d04cc77 100644 --- a/tests/e2e/util/operator/operator.go +++ b/tests/e2e/util/operator/operator.go @@ -30,8 +30,6 @@ type OperatorKillerConfig struct { Interval time.Duration // Operator pods will be deleted between [Interval, Interval * (1.0 + JitterFactor)]. JitterFactor float64 - // StopCh is a channel that is used to notify OperatorKiller to stop killing operator pods. - StopCh chan struct{} } // OperatorKiller deletes pods of tidb-operator to simulate operator failures. @@ -43,7 +41,6 @@ type OperatorKiller struct { // NewOperatorKiller creates a new operator killer. 
func NewOperatorKiller(config OperatorKillerConfig, client kubernetes.Interface, podLister func() ([]v1.Pod, error)) *OperatorKiller { - config.StopCh = make(chan struct{}) return &OperatorKiller{ config: config, client: client, From abac6fef41d64ded164749c0e65bec764c453657 Mon Sep 17 00:00:00 2001 From: Yecheng Fu Date: Tue, 7 Apr 2020 16:37:40 +0800 Subject: [PATCH 3/3] add successful log --- tests/e2e/util/operator/operator.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e/util/operator/operator.go b/tests/e2e/util/operator/operator.go index 617d04cc77..58d0b87188 100644 --- a/tests/e2e/util/operator/operator.go +++ b/tests/e2e/util/operator/operator.go @@ -62,6 +62,8 @@ func (k *OperatorKiller) Run(stopCh <-chan struct{}) { err = k.client.CoreV1().Pods(pod.Namespace).Delete(pod.Name, &metav1.DeleteOptions{}) if err != nil { framework.Logf("failed to delete pod %s/%s: %v", pod.Namespace, pod.Name, err) + } else { + framework.Logf("successfully deleted tidb-operator pod %s/%s", pod.Namespace, pod.Name) } } }, k.config.Interval, k.config.JitterFactor, true, stopCh)