fix tikv failover #368

Merged 4 commits on Apr 9, 2019
2 changes: 1 addition & 1 deletion charts/tidb-cluster/values.yaml
@@ -52,7 +52,7 @@ pd:

# maxStoreDownTime is how long a store will be considered `down` when disconnected
# if a store is considered `down`, the regions will be migrated to other stores
- maxStoreDownTime: 1h
+ maxStoreDownTime: 30m
# maxReplicas is the number of replicas for each region
maxReplicas: 3
resources:

2 changes: 2 additions & 0 deletions charts/tidb-operator/values.yaml
@@ -32,6 +32,8 @@ controllerManager:
autoFailover: false
# pd failover period default(5m)
pdFailoverPeriod: 5m
+ # tikv failover period default(5m)
+ tikvFailoverPeriod: 5m
# tidb failover period default(5m)
tidbFailoverPeriod: 5m

5 changes: 3 additions & 2 deletions cmd/controller-manager/main.go
@@ -43,6 +43,7 @@ var (
workers int
autoFailover bool
pdFailoverPeriod time.Duration
+ tikvFailoverPeriod time.Duration
tidbFailoverPeriod time.Duration
leaseDuration = 15 * time.Second
renewDuration = 5 * time.Second

@@ -59,6 +60,7 @@ func init() {
flag.StringVar(&controller.DefaultStorageClassName, "default-storage-class-name", "standard", "Default storage class name")
flag.BoolVar(&autoFailover, "auto-failover", false, "Auto failover")
flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)")
+ flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", time.Duration(5*time.Minute), "TiKV failover period default(5m)")
flag.DurationVar(&tidbFailoverPeriod, "tidb-failover-period", time.Duration(5*time.Minute), "TiDB failover period")

flag.Parse()

@@ -120,8 +122,7 @@ func main() {
},
}

- tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tidbFailoverPeriod)
-
+ tcController := tidbcluster.NewController(kubeCli, cli, informerFactory, kubeInformerFactory, autoFailover, pdFailoverPeriod, tikvFailoverPeriod, tidbFailoverPeriod)
controllerCtx, cancel := context.WithCancel(context.Background())
defer cancel()
go informerFactory.Start(controllerCtx.Done())

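For context, a minimal standalone sketch of how the new duration flag is parsed (illustration only; the flag name and 5m default come from the hunk above, and the real binary passes the value straight into tidbcluster.NewController):

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Same shape as the flag added in this PR: a time.Duration with a 5m
	// default, so values such as -tikv-failover-period=10m are accepted.
	var tikvFailoverPeriod time.Duration
	flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", 5*time.Minute, "TiKV failover period default(5m)")
	flag.Parse()
	fmt.Println("tikv failover period:", tikvFailoverPeriod)
}
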
4 changes: 4 additions & 0 deletions go.sum
@@ -202,10 +202,14 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007 h1:UnHxDq9ldm4vol94wlSWDF3SU4IyC8IWVWtg266CzoY=
golang.org/x/sys v0.0.0-20180807162357-acbc56fc7007/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a h1:1BGLXjeY4akVXGgbC9HugT3Jv3hCI0z56oJR5vAMgBU=
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2 h1:+DCIGbF/swA92ohVg0//6X2IVY3KZs6p9mix0ziNYJM=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+ golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc h1:9OQUxGJQk/Rt2SmlbFsqnsyFaX1YiLbBfUJezBkCaa0=
+ golang.org/x/tools v0.0.0-20190403183509-8a44e74612bc/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
google.golang.org/appengine v1.1.0 h1:igQkv0AAhEIvTEpD5LIpAfav2eeVO9HBTjvKHVJPRSs=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=

3 changes: 2 additions & 1 deletion pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -73,6 +73,7 @@ func NewController(
kubeInformerFactory kubeinformers.SharedInformerFactory,
autoFailover bool,
pdFailoverPeriod time.Duration,
+ tikvFailoverPeriod time.Duration,
tidbFailoverPeriod time.Duration,
) *Controller {
eventBroadcaster := record.NewBroadcaster()

@@ -100,7 +101,7 @@ func NewController(
pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister())
- tikvFailover := mm.NewTiKVFailover(pdControl)
+ tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod)
tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())

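As a rough sketch of why the constructor can shrink like this (stand-in types below, not the real v1alpha1 or member packages): each component failover implements the same small interface, and after this PR the TiKV implementation, like the TiDB one, only needs its own period rather than a PD control handle:

package main

import (
	"fmt"
	"time"
)

// TidbCluster stands in for v1alpha1.TidbCluster.
type TidbCluster struct{ Name string }

// Failover stands in for the member.Failover interface the controller wires up.
type Failover interface {
	Failover(tc *TidbCluster) error
}

type tikvFailover struct{ period time.Duration }

func (tf *tikvFailover) Failover(tc *TidbCluster) error {
	fmt.Printf("evaluating TiKV failover for %s with period %s\n", tc.Name, tf.period)
	return nil
}

func main() {
	// Mirrors the new wiring: the controller passes only tikvFailoverPeriod.
	var f Failover = &tikvFailover{period: 5 * time.Minute}
	_ = f.Failover(&TidbCluster{Name: "demo"})
}
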
1 change: 1 addition & 0 deletions pkg/controller/tidbcluster/tidb_cluster_controller_test.go
@@ -233,6 +233,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer)
autoFailover,
5*time.Minute,
5*time.Minute,
+ 5*time.Minute,
)
tcc.tcListerSynced = alwaysReady
tcc.setListerSynced = alwaysReady

14 changes: 4 additions & 10 deletions pkg/manager/member/tikv_failover.go
@@ -17,30 +17,24 @@ import (
"time"

"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
- "github.com/pingcap/tidb-operator/pkg/controller"
)

type tikvFailover struct {
- pdControl controller.PDControlInterface
+ tikvFailoverPeriod time.Duration
}

// NewTiKVFailover returns a tikv Failover
- func NewTiKVFailover(pdControl controller.PDControlInterface) Failover {
- return &tikvFailover{pdControl}
+ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover {
+ return &tikvFailover{tikvFailoverPeriod}
}

func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
- cfg, err := tf.pdControl.GetPDClient(tc).GetConfig()
- if err != nil {
- return err
- }
-
for storeID, store := range tc.Status.TiKV.Stores {
podName := store.PodName
if store.LastTransitionTime.IsZero() {
continue
}
- deadline := store.LastTransitionTime.Add(cfg.Schedule.MaxStoreDownTime.Duration)
+ deadline := store.LastTransitionTime.Add(tf.tikvFailoverPeriod)
exist := false
for _, failureStore := range tc.Status.TiKV.FailureStores {
if failureStore.PodName == podName {

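Read as a whole, the new Failover path boils down to a per-store deadline check against the operator-level period. A self-contained sketch follows (simplified stand-in types rather than the real v1alpha1 structs; the state check and the exact FailureStores bookkeeping are assumptions based on the surrounding test cases, since the diff above does not show them):

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the v1alpha1 status types.
type TiKVStore struct {
	PodName            string
	State              string
	LastTransitionTime time.Time
}

type TiKVStatus struct {
	Stores        map[string]TiKVStore
	FailureStores map[string]TiKVStore
}

type tikvFailover struct {
	tikvFailoverPeriod time.Duration
}

// Failover records a store as failed once it has stayed Down past the
// operator-level period, instead of asking PD for max-store-down-time as the
// pre-PR code did.
func (tf *tikvFailover) Failover(status *TiKVStatus) error {
	for storeID, store := range status.Stores {
		if store.LastTransitionTime.IsZero() {
			continue
		}
		deadline := store.LastTransitionTime.Add(tf.tikvFailoverPeriod)
		if store.State == "Down" && time.Now().After(deadline) {
			if _, exist := status.FailureStores[storeID]; !exist {
				status.FailureStores[storeID] = store
			}
		}
	}
	return nil
}

func main() {
	status := &TiKVStatus{
		Stores: map[string]TiKVStore{
			"1": {PodName: "tikv-1", State: "Down", LastTransitionTime: time.Now().Add(-10 * time.Minute)},
		},
		FailureStores: map[string]TiKVStore{},
	}
	f := &tikvFailover{tikvFailoverPeriod: 5 * time.Minute}
	_ = f.Failover(status)
	fmt.Println("failure stores:", len(status.FailureStores)) // prints 1
}
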
56 changes: 12 additions & 44 deletions pkg/manager/member/tikv_failover_test.go
@@ -14,44 +14,28 @@
package member

import (
- "fmt"
"testing"
"time"

. "github.com/onsi/gomega"
- "github.com/pingcap/pd/pkg/typeutil"
- "github.com/pingcap/pd/server"
"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
- "github.com/pingcap/tidb-operator/pkg/controller"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestTiKVFailoverFailover(t *testing.T) {
g := NewGomegaWithT(t)

type testcase struct {
- name string
- update func(*v1alpha1.TidbCluster)
- getCfgErr bool
- err bool
- expectFn func(*v1alpha1.TidbCluster)
+ name string
+ update func(*v1alpha1.TidbCluster)
+ err bool
+ expectFn func(*v1alpha1.TidbCluster)
}
testFn := func(test *testcase, t *testing.T) {
t.Log(test.name)
tc := newTidbClusterForPD()
test.update(tc)
- tikvFailover, fakePDControl := newFakeTiKVFailover()
- pdClient := controller.NewFakePDClient()
- fakePDControl.SetPDClient(tc, pdClient)
-
- pdClient.AddReaction(controller.GetConfigActionType, func(action *controller.Action) (interface{}, error) {
- if test.getCfgErr {
- return nil, fmt.Errorf("get config failed")
- }
- return &server.Config{
- Schedule: server.ScheduleConfig{MaxStoreDownTime: typeutil.Duration{Duration: 1 * time.Hour}},
- }, nil
- })
+ tikvFailover := newFakeTiKVFailover()

err := tikvFailover.Failover(tc)
if test.err {

@@ -79,32 +63,20 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
- getCfgErr: false,
- err: false,
+ err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(2))
},
},
- {
- name: "get config failed",
- update: func(*v1alpha1.TidbCluster) {},
- getCfgErr: true,
- err: true,
- expectFn: func(tc *v1alpha1.TidbCluster) {
- g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
- g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))
- },
- },
{
name: "tikv state is not Down",
update: func(tc *v1alpha1.TidbCluster) {
tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
"1": {State: v1alpha1.TiKVStateUp, PodName: "tikv-1"},
}
},
- getCfgErr: false,
- err: false,
+ err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))

@@ -121,8 +93,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
- getCfgErr: false,
- err: false,
+ err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))

@@ -138,8 +109,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
- getCfgErr: false,
- err: false,
+ err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(0))

@@ -162,8 +132,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
},
}
},
- getCfgErr: false,
- err: false,
+ err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1))

@@ -175,7 +144,6 @@ func TestTiKVFailoverFailover(t *testing.T) {
}
}

- func newFakeTiKVFailover() (*tikvFailover, *controller.FakePDControl) {
- pdControl := controller.NewFakePDControl()
- return &tikvFailover{pdControl}, pdControl
+ func newFakeTiKVFailover() *tikvFailover {
+ return &tikvFailover{1 * time.Hour}
}

3 changes: 3 additions & 0 deletions tests/failover.go
@@ -214,6 +214,9 @@ func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster)
return false
}

+ // TODO we should confirm the tombstone exists, important!!!!!!
+ // for example: offline the same pod again and again, and see it in the tombstone stores
+ // offline two pods, and see them in the tombstone stores
func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool {
failure := false
for _, failureStore := range tc.Status.TiKV.FailureStores {

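The TODO added above asks the e2e check to also verify that an offlined store eventually becomes a tombstone. A hedged sketch of what that check might look like (stand-in types only; the real test would read tombstone stores from the cluster status or the PD API, neither of which is shown in this diff):

package main

import "fmt"

// Store stands in for a tombstone store entry in the cluster status.
type Store struct{ PodName string }

// hasTombstone reports whether the given pod shows up among the tombstone
// stores, e.g. after offlining the same TiKV pod repeatedly as the TODO suggests.
func hasTombstone(tombstoneStores map[string]Store, podName string) bool {
	for _, s := range tombstoneStores {
		if s.PodName == podName {
			return true
		}
	}
	return false
}

func main() {
	tombstones := map[string]Store{"4": {PodName: "demo-tikv-1"}}
	fmt.Println(hasTombstone(tombstones, "demo-tikv-1")) // true
}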