From fa861dcce31501bc7a5274e13d1779e75e2f3cc3 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 4 Apr 2019 19:33:21 +0800 Subject: [PATCH 1/2] fix tikv failover --- .gitignore | 1 + charts/tidb-cluster/values.yaml | 2 +- charts/tidb-operator/values.yaml | 2 + cmd/controller-manager/main.go | 4 +- go.mod | 9 +-- go.sum | 14 +++++ .../tidbcluster/tidb_cluster_controller.go | 3 +- .../tidb_cluster_controller_test.go | 1 + pkg/manager/member/tikv_failover.go | 14 ++--- pkg/manager/member/tikv_failover_test.go | 56 ++++--------------- 10 files changed, 45 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index 31fb962bf4..e8c41829a9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ tests/images/stability-test/bin/ tests/images/e2e/bin/ tests/images/fault-trigger/bin/ tests/images/e2e/tidb-cluster/ +tests/images/e2e/tidb-backup/ tests/images/e2e/tidb-operator/ *.tar tmp/ diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index c0fd8be1fb..fee5171ac3 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -52,7 +52,7 @@ pd: # maxStoreDownTime is how long a store will be considered `down` when disconnected # if a store is considered `down`, the regions will be migrated to other stores - maxStoreDownTime: 1h + maxStoreDownTime: 30m # maxReplicas is the number of replicas for each region maxReplicas: 3 resources: diff --git a/charts/tidb-operator/values.yaml b/charts/tidb-operator/values.yaml index 2de213e0e9..bde3d7b254 100644 --- a/charts/tidb-operator/values.yaml +++ b/charts/tidb-operator/values.yaml @@ -32,6 +32,8 @@ controllerManager: autoFailover: false # pd failover period default(5m) pdFailoverPeriod: 5m + # tikv failover period default(5m) + tikvFailoverPeriod: 5m # tidb failover period default(5m) tidbFailoverPeriod: 5m diff --git a/cmd/controller-manager/main.go b/cmd/controller-manager/main.go index 226668e4b3..f9b96bb135 100644 --- a/cmd/controller-manager/main.go +++ b/cmd/controller-manager/main.go @@ -42,6 +42,7 @@ var ( workers int autoFailover bool pdFailoverPeriod time.Duration + tikvFailoverPeriod time.Duration tidbFailoverPeriod time.Duration leaseDuration = 15 * time.Second renewDuration = 5 * time.Second @@ -58,6 +59,7 @@ func init() { flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name") flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover") flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)") + flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", time.Duration(5*time.Minute), "TiKV failover period default(5m)") flag.DurationVar(&tidbFailoverPeriod, "tidb-failover-period", time.Duration(5*time.Minute), "TiDB failover period") flag.Parse() @@ -119,7 +121,7 @@ func main() { }, } - tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tidbFailoverPeriod) + tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tikvFailoverPeriod, tidbFailoverPeriod) stop := make(chan struct{}) defer close(stop) go informerFactory.Start(stop) diff --git a/go.mod b/go.mod index 7b07961781..a780a1a2f8 100644 --- a/go.mod +++ b/go.mod @@ -65,12 +65,10 @@ require ( go.uber.org/atomic v1.3.2 // indirect go.uber.org/multierr v1.1.0 // indirect go.uber.org/zap v1.9.1 // indirect - golang.org/x/crypto v0.0.0-20180807104621-f027049dab0a // indirect - golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24 + golang.org/x/net v0.0.0-20190311183353-d8887717615a golang.org/x/sync v0.0.0-20181108010431-42b317875d0f // indirect - golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007 // indirect - golang.org/x/text v0.3.0 // indirect golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 // indirect + golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc // indirect google.golang.org/appengine v1.1.0 // indirect google.golang.org/genproto v0.0.0-20180731170733-daca94659cb5 // indirect google.golang.org/grpc v1.12.0 // indirect @@ -87,6 +85,9 @@ require ( k8s.io/apimachinery v0.0.0-20180228050457-302974c03f7e k8s.io/apiserver v0.0.0-20180807214737-1b22bcf04547 k8s.io/client-go v0.0.0-20180424211516-33f2870a2b83 + k8s.io/code-generator v0.0.0-20180228050103-7ead8f38b01c // indirect + k8s.io/gengo v0.0.0-20190327210449-e17681d19d3a // indirect + k8s.io/klog v0.2.0 // indirect k8s.io/kube-openapi v0.0.0-20180731170545-e3762e86a74c // indirect k8s.io/kubernetes v1.10.2 ) diff --git a/go.sum b/go.sum index c7eac8186c..e961b5cce5 100644 --- a/go.sum +++ b/go.sum @@ -130,16 +130,24 @@ go.uber.org/zap v1.9.1 h1:XCJQEf3W6eZaVwhRBof6ImoYGJSITeKWsyeh3HFu/5o= go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= golang.org/x/crypto v0.0.0-20180807104621-f027049dab0a h1:PulT0Y50PcfTWomfsD39bSQyVrjjWdIuJKfyR4nOCJw= golang.org/x/crypto v0.0.0-20180807104621-f027049dab0a/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24 h1:mEsFm194MmS9vCwxFy+zwu0EU7ZkxxMD1iH++vmGdUY= golang.org/x/net v0.0.0-20180808004115-f9ce57c11b24/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTmV7VDcZyvRZ+QQXkXTZQ= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007 h1:UnHxDq9ldm4vol94wlSWDF3SU4IyC8IWVWtg266CzoY= golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 h1:+DCIGbF/swA92ohVg0//6X2IVY3KZs6p9mix0ziNYJM= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc h1:9OQUxGJQk/Rt2SmlbFsqnsyFaX1YiLbBfUJezBkCaa0= +golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= google.golang.org/appengine v1.1.0 h1:igQkv0AAhEIvTEpD5LIpAfav2eeVO9HBTjvKHVJPRSs= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/genproto v0.0.0-20180731170733-daca94659cb5 h1:2PjFmwzH/sxgW9CRJDlEiwMHO8rOk1eMDzVL14HC1e4= @@ -174,6 +182,12 @@ k8s.io/apiserver v0.0.0-20180807214737-1b22bcf04547 h1:6ywv8+3hlyxlevR3XvNSECNIo k8s.io/apiserver v0.0.0-20180807214737-1b22bcf04547/go.mod h1:6bqaTSOSJavUIXUtfaR9Os9JtTCm8ZqH2SUl2S60C4w= k8s.io/client-go v0.0.0-20180424211516-33f2870a2b83 h1:zFG4vdZo+7VePgo3YFfw+ZZURiwY0BGAzdt20E6X9M0= k8s.io/client-go v0.0.0-20180424211516-33f2870a2b83/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s= +k8s.io/code-generator v0.0.0-20180228050103-7ead8f38b01c h1:tpq+nvY9+2K8qRsOIUzJY8Mq7sNrfisNwQLc4/Xw76o= +k8s.io/code-generator v0.0.0-20180228050103-7ead8f38b01c/go.mod h1:MYiN+ZJZ9HkETbgVZdWw2AsuAi9PZ4V80cwfuf2axe8= +k8s.io/gengo v0.0.0-20190327210449-e17681d19d3a h1:QoHVuRquf80YZ+/bovwxoMO3Q/A3nt3yTgS0/0nejuk= +k8s.io/gengo v0.0.0-20190327210449-e17681d19d3a/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= +k8s.io/klog v0.2.0 h1:0ElL0OHzF3N+OhoJTL0uca20SxtYt4X4+bzHeqrB83c= +k8s.io/klog v0.2.0/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= k8s.io/kube-openapi v0.0.0-20180731170545-e3762e86a74c h1:3KSCztE7gPitlZmWbNwue/2U0YruD65DqX3INopDAQM= k8s.io/kube-openapi v0.0.0-20180731170545-e3762e86a74c/go.mod h1:BXM9ceUBTj2QnfH2MK1odQs778ajze1RxcmP6S8RVVc= k8s.io/kubernetes v1.10.2 h1:ad/NtArD2zm//BVvQpiWHHpSxryJrSd73B68Z18WFzM= diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go index 677802af01..38eb2f77b1 100644 --- a/pkg/controller/tidbcluster/tidb_cluster_controller.go +++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go @@ -73,6 +73,7 @@ func NewController( kubeInformerFactory kubeinformers.SharedInformerFactory, autoFailover bool, pdFailoverPeriod time.Duration, + tikvFailoverPeriod time.Duration, tidbFailoverPeriod time.Duration, ) *Controller { eventBroadcaster := record.NewBroadcaster() @@ -100,7 +101,7 @@ func NewController( pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl) tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister()) pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister()) - tikvFailover := mm.NewTiKVFailover(pdControl) + tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod) tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod) pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister()) tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister()) diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go index c9a442bb0a..f5745f9da0 100644 --- a/pkg/controller/tidbcluster/tidb_cluster_controller_test.go +++ b/pkg/controller/tidbcluster/tidb_cluster_controller_test.go @@ -233,6 +233,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer) autoFailover, 5*time.Minute, 5*time.Minute, + 5*time.Minute, ) tcc.tcListerSynced = alwaysReady tcc.setListerSynced = alwaysReady diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go index 6724cd4689..8f39e1d8fa 100644 --- a/pkg/manager/member/tikv_failover.go +++ b/pkg/manager/member/tikv_failover.go @@ -17,30 +17,24 @@ import ( "time" "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" - "github.com/pingcap/tidb-operator/pkg/controller" ) type tikvFailover struct { - pdControl controller.PDControlInterface + tikvFailoverPeriod time.Duration } // NewTiKVFailover returns a tikv Failover -func NewTiKVFailover(pdControl controller.PDControlInterface) Failover { - return &tikvFailover{pdControl} +func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover { + return &tikvFailover{tikvFailoverPeriod} } func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { - cfg, err := tf.pdControl.GetPDClient(tc).GetConfig() - if err != nil { - return err - } - for storeID, store := range tc.Status.TiKV.Stores { podName := store.PodName if store.LastTransitionTime.IsZero() { continue } - deadline := store.LastTransitionTime.Add(cfg.Schedule.MaxStoreDownTime.Duration) + deadline := store.LastTransitionTime.Add(tf.tikvFailoverPeriod) exist := false for _, failureStore := range tc.Status.TiKV.FailureStores { if failureStore.PodName == podName { diff --git a/pkg/manager/member/tikv_failover_test.go b/pkg/manager/member/tikv_failover_test.go index b5be0ca0a8..2c82de803e 100644 --- a/pkg/manager/member/tikv_failover_test.go +++ b/pkg/manager/member/tikv_failover_test.go @@ -14,15 +14,11 @@ package member import ( - "fmt" "testing" "time" . "github.com/onsi/gomega" - "github.com/pingcap/pd/pkg/typeutil" - "github.com/pingcap/pd/server" "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" - "github.com/pingcap/tidb-operator/pkg/controller" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -30,28 +26,16 @@ func TestTiKVFailoverFailover(t *testing.T) { g := NewGomegaWithT(t) type testcase struct { - name string - update func(*v1alpha1.TidbCluster) - getCfgErr bool - err bool - expectFn func(*v1alpha1.TidbCluster) + name string + update func(*v1alpha1.TidbCluster) + err bool + expectFn func(*v1alpha1.TidbCluster) } testFn := func(test *testcase, t *testing.T) { t.Log(test.name) tc := newTidbClusterForPD() test.update(tc) - tikvFailover, fakePDControl := newFakeTiKVFailover() - pdClient := controller.NewFakePDClient() - fakePDControl.SetPDClient(tc, pdClient) - - pdClient.AddReaction(controller.GetConfigActionType, func(action *controller.Action) (interface{}, error) { - if test.getCfgErr { - return nil, fmt.Errorf("get config failed") - } - return &server.Config{ - Schedule: server.ScheduleConfig{MaxStoreDownTime: typeutil.Duration{Duration: 1 * time.Hour}}, - }, nil - }) + tikvFailover := newFakeTiKVFailover() err := tikvFailover.Failover(tc) if test.err { @@ -79,23 +63,12 @@ func TestTiKVFailoverFailover(t *testing.T) { }, } }, - getCfgErr: false, - err: false, + err: false, expectFn: func(tc *v1alpha1.TidbCluster) { g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(2)) }, }, - { - name: "get config failed", - update: func(*v1alpha1.TidbCluster) {}, - getCfgErr: true, - err: true, - expectFn: func(tc *v1alpha1.TidbCluster) { - g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) - g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0)) - }, - }, { name: "tikv state is not Down", update: func(tc *v1alpha1.TidbCluster) { @@ -103,8 +76,7 @@ func TestTiKVFailoverFailover(t *testing.T) { "1": {State: v1alpha1.TiKVStateUp, PodName: "tikv-1"}, } }, - getCfgErr: false, - err: false, + err: false, expectFn: func(tc *v1alpha1.TidbCluster) { g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0)) @@ -121,8 +93,7 @@ func TestTiKVFailoverFailover(t *testing.T) { }, } }, - getCfgErr: false, - err: false, + err: false, expectFn: func(tc *v1alpha1.TidbCluster) { g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0)) @@ -138,8 +109,7 @@ func TestTiKVFailoverFailover(t *testing.T) { }, } }, - getCfgErr: false, - err: false, + err: false, expectFn: func(tc *v1alpha1.TidbCluster) { g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0)) @@ -162,8 +132,7 @@ func TestTiKVFailoverFailover(t *testing.T) { }, } }, - getCfgErr: false, - err: false, + err: false, expectFn: func(tc *v1alpha1.TidbCluster) { g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1)) @@ -175,7 +144,6 @@ func TestTiKVFailoverFailover(t *testing.T) { } } -func newFakeTiKVFailover() (*tikvFailover, *controller.FakePDControl) { - pdControl := controller.NewFakePDControl() - return &tikvFailover{pdControl}, pdControl +func newFakeTiKVFailover() *tikvFailover { + return &tikvFailover{1 * time.Hour} } From e6d7947685e8d84e3bdb6dbd1df86b4543c05af8 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 4 Apr 2019 19:47:39 +0800 Subject: [PATCH 2/2] add TODO --- tests/failover.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/failover.go b/tests/failover.go index ea1631909d..59aa87cce4 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -214,6 +214,9 @@ func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) return false } +// TODO we should confirm the tombstone exists, important!!!!!! +// for example: offline the same pod again and again, and see it in the tombstone stores +// offline two pods, and see them in the tombstone stores func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { failure := false for _, failureStore := range tc.Status.TiKV.FailureStores {