diff --git a/metrics/metrics.go b/metrics/metrics.go index 3b07785a5..1087b3dec 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -32,6 +32,7 @@ const ( HEALTHY = 0.0 REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" + NO_MASTER = "NO_MASTER_AVAILABLE" NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" @@ -65,6 +66,7 @@ const ( MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER" MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE" GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE" + CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM" SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" ) diff --git a/mocks/operator/redisfailover/service/RedisFailoverCheck.go b/mocks/operator/redisfailover/service/RedisFailoverCheck.go index c21c37f1a..e58ad55b9 100644 --- a/mocks/operator/redisfailover/service/RedisFailoverCheck.go +++ b/mocks/operator/redisfailover/service/RedisFailoverCheck.go @@ -29,6 +29,27 @@ func (_m *RedisFailoverCheck) CheckAllSlavesFromMaster(master string, rFailover return r0 } +// CheckIfMasterLocalhost provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) CheckIfMasterLocalhost(rFailover *v1.RedisFailover) (bool, error) { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { + r1 = rf(rFailover) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // CheckRedisNumber provides a mock function with given fields: rFailover func (_m *RedisFailoverCheck) CheckRedisNumber(rFailover *v1.RedisFailover) error { ret := _m.Called(rFailover) @@ -113,6 +134,27 @@ func (_m *RedisFailoverCheck) CheckSentinelNumberInMemory(sentinel string, rFail return r0 } +// CheckSentinelQuorum provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) CheckSentinelQuorum(rFailover *v1.RedisFailover) (int, error) { + ret := _m.Called(rFailover) + + var r0 int + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) int); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { + r1 = rf(rFailover) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // CheckSentinelSlavesNumberInMemory provides a mock function with given fields: sentinel, rFailover func (_m *RedisFailoverCheck) CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *v1.RedisFailover) error { ret := _m.Called(sentinel, rFailover) @@ -148,8 +190,8 @@ func (_m *RedisFailoverCheck) GetMasterIP(rFailover *v1.RedisFailover) (string, return r0, r1 } -// GetMinimumRedisPodTime provides a mock function with given fields: rFailover -func (_m *RedisFailoverCheck) GetMinimumRedisPodTime(rFailover *v1.RedisFailover) (time.Duration, error) { +// GetMaxRedisPodTime provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) GetMaxRedisPodTime(rFailover *v1.RedisFailover) (time.Duration, error) { ret := _m.Called(rFailover) var r0 time.Duration diff --git a/mocks/service/redis/Client.go b/mocks/service/redis/Client.go index b52028d19..b44674d51 100644 --- a/mocks/service/redis/Client.go +++ b/mocks/service/redis/Client.go @@ -205,6 +205,20 @@ func (_m *Client) ResetSentinel(ip string) error { return r0 } +// SentinelCheckQuorum provides a mock function with given fields: ip +func (_m *Client) SentinelCheckQuorum(ip string) error { + ret := _m.Called(ip) + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(ip) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // SetCustomRedisConfig provides a mock function with given fields: ip, port, configs, password func (_m *Client) SetCustomRedisConfig(ip string, port string, configs []string, password string) error { ret := _m.Called(ip, port, configs, password) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 0324b341b..89a0c2f19 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -9,10 +9,6 @@ import ( "github.com/spotahome/redis-operator/metrics" ) -const ( - timeToPrepare = 2 * time.Minute -) - // UpdateRedisesPods if the running version of pods are equal to the statefulset one func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error { redises, err := r.rfChecker.GetRedisesIPs(rf) @@ -118,34 +114,71 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { return err } + switch nMasters { case 0: - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected")) - redisesIP, err := r.rfChecker.GetRedisesIPs(rf) - if err != nil { - return err - } - if len(redisesIP) == 1 { - if err := r.rfHealer.MakeMaster(redisesIP[0], rf); err != nil { + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected")) + //when number of redis replicas is 1 , the redis is configured for standalone master mode + //Configure to master + if rf.Spec.Redis.Replicas == 1 { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master") + err = r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err) + if err != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") return err } - break + return nil } - minTime, err2 := r.rfChecker.GetMinimumRedisPodTime(rf) - if err2 != nil { - return err2 + //During the First boot(New deployment or all pods of the statefulsets have restarted), + //Sentinesl will not be able to choose the master , so operator should select a master + //Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached + //Operator can choose a master , These scenarios can be checked by asking the all the sentinels + //if its in a postion to choose a master also check if the redis is configured with local host IP as master. + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0") + maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf) + if err != nil { + return err } - if minTime > timeToPrepare { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) - // We can consider there's an error - if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { + + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds()) + //Check If Sentinel has quorum to take a failover decision + noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf) + if err != nil { + // Sentinels are not in a situation to choose a master we pick one + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt) + err2 := r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2) + if err2 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") return err2 } } else { - // We'll wait until failover is done - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover") - return nil + //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master + status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf) + if err2 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later") + return err2 + } else if status { + // all avaialable redis pods have local host ip as master + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection") + err3 := r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3) + if err3 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + return err3 + } + + } else { + + // We'll wait until failover is done + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually") + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually")) + return nil + } + } + case 1: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) default: diff --git a/operator/redisfailover/checker_test.go b/operator/redisfailover/checker_test.go index c8ae26970..2e83a47b4 100644 --- a/operator/redisfailover/checker_test.go +++ b/operator/redisfailover/checker_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -25,7 +24,9 @@ func TestCheckAndHeal(t *testing.T) { name string nMasters int nRedis int - forceNewMaster bool + forceNewMasterNoQrm bool + forceNewMasterFirstBoot bool + singleMasterTest bool slavesOK bool sentinelMonitorOK bool sentinelNumberInMemoryOK bool @@ -39,7 +40,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Everything ok, no need to heal", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -53,7 +56,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Multiple masters", nMasters: 2, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -67,7 +72,9 @@ func TestCheckAndHeal(t *testing.T) { name: "No masters but wait", nMasters: 0, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -81,7 +88,9 @@ func TestCheckAndHeal(t *testing.T) { name: "No masters, only one redis available, make master", nMasters: 0, nRedis: 1, - forceNewMaster: false, + singleMasterTest: true, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -92,10 +101,27 @@ func TestCheckAndHeal(t *testing.T) { allowSentinels: false, }, { - name: "No masters, set random", + name: "No masters,No sentinel quorum set random", nMasters: 0, nRedis: 3, - forceNewMaster: true, + singleMasterTest: false, + forceNewMasterNoQrm: true, + forceNewMasterFirstBoot: false, + slavesOK: true, + sentinelMonitorOK: true, + sentinelNumberInMemoryOK: true, + redisCheckNumberOK: true, + redisSetMasterOnAllOK: true, + sentinelSlavesNumberInMemoryOK: true, + allowSentinels: false, + }, + { + name: "No masters,Sentinel Quorum but slave of local host set random", + nMasters: 0, + nRedis: 3, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: true, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -108,7 +134,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Slaves from master wrong", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: false, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -122,7 +150,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels not pointing correct monitor", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: false, sentinelNumberInMemoryOK: true, @@ -136,7 +166,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels with wrong number of sentinels", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: false, @@ -150,7 +182,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels with wrong number of slaves", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -251,6 +285,9 @@ func TestCheckAndHeal(t *testing.T) { allowSentinels = test.allowSentinels rf.Spec.BootstrapNode.AllowSentinels = allowSentinels } + if test.singleMasterTest { + rf.Spec.Redis.Replicas = 1 + } expErr := false continueTests := true @@ -297,18 +334,26 @@ func TestCheckAndHeal(t *testing.T) { mrfc.On("GetNumberMasters", rf).Once().Return(test.nMasters, nil) switch test.nMasters { case 0: - mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil) - if test.nRedis == 1 { - mrfh.On("MakeMaster", mock.Anything, rf).Once().Return(nil) + //mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil) + if rf.Spec.Redis.Replicas == 1 { + mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) + continueTests = false break } - if test.forceNewMaster { - mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Hour, nil) + mrfc.On("GetMaxRedisPodTime", rf).Once().Return(1*time.Hour, nil) + if test.forceNewMasterNoQrm { + mrfc.On("CheckSentinelQuorum", rf).Once().Return(1, errors.New("")) + mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) + } else if test.forceNewMasterFirstBoot { + mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil) + mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(true, nil) mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) } else { - mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Second, nil) + mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil) + mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(false, nil) continueTests = false } + case 1: break default: diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 1a2a94914..e54f49005 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -23,12 +23,14 @@ type RedisFailoverCheck interface { CheckAllSlavesFromMaster(master string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error + CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) + CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) CheckSentinelMonitor(sentinel string, monitor ...string) error GetMasterIP(rFailover *redisfailoverv1.RedisFailover) (string, error) GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error) GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) - GetMinimumRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) + GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) @@ -148,6 +150,81 @@ func (r *RedisFailoverChecker) CheckSentinelNumberInMemory(sentinel string, rf * return nil } +// This function will check if the local host ip is set as the master for all currently available pods +// This can be used to detect the fresh boot of all the redis pods +// This function returns true if it all available pods have local host ip as master, +// false if atleast one of the ip is not local hostip +// false and error if any function fails +func (r *RedisFailoverChecker) CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) { + + var lhmaster int = 0 + redisIps, err := r.GetRedisesIPs(rFailover) + if len(redisIps) == 0 || err != nil { + r.logger.Warningf("CheckIfMasterLocalhost GetRedisesIPs Failed- unable to fetch any redis Ips Currently") + return false, errors.New("unable to fetch any redis Ips Currently") + } + password, err := k8s.GetRedisPassword(r.k8sService, rFailover) + if err != nil { + r.logger.Errorf("CheckIfMasterLocalhost -- GetRedisPassword Failed") + return false, err + } + rport := getRedisPort(rFailover.Spec.Redis.Port) + for _, sip := range redisIps { + master, err := r.redisClient.GetSlaveOf(sip, rport, password) + if err != nil { + r.logger.Warningf("CheckIfMasterLocalhost -- GetSlaveOf Failed") + return false, err + } else if master == "" { + r.logger.Warningf("CheckIfMasterLocalhost -- Master already available ?? check manually") + return false, errors.New("unexpected master state, fix manually") + } else { + if master == "127.0.0.1" { + lhmaster++ + } + } + } + if lhmaster == len(redisIps) { + r.logger.Infof("all available redis configured localhost as master , opertor must heal") + return true, nil + } + r.logger.Infof("atleast one pod does not have localhost as master , opertor should not heal") + return false, nil +} + +// This function will call the sentinel client apis to check with sentinel if the sentinel is in a state +// to heal the redis system +func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) { + + var unhealthyCnt int = -1 + + sentinels, err := r.GetSentinelsIPs(rFailover) + if err != nil { + r.logger.Warningf("CheckSentinelQuorum Error in getting sentinel Ip's") + return unhealthyCnt, err + } + if len(sentinels) < int(getQuorum(rFailover)) { + unhealthyCnt = int(getQuorum(rFailover)) - len(sentinels) + r.logger.Warningf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) + return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") + } + + unhealthyCnt = 0 + for _, sip := range sentinels { + err = r.redisClient.SentinelCheckQuorum(sip) + if err != nil { + unhealthyCnt += 1 + } else { + continue + } + } + if unhealthyCnt < int(getQuorum(rFailover)) { + return unhealthyCnt, nil + } else { + r.logger.Errorf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) + return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") + } +} + // CheckSentinelSlavesNumberInMemory controls that the provided sentinel has only the expected slaves number. func (r *RedisFailoverChecker) CheckSentinelSlavesNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error { nSlaves, err := r.redisClient.GetNumberSentinelSlavesInMemory(sentinel) @@ -266,12 +343,12 @@ func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover return sentinels, nil } -// GetMinimumRedisPodTime returns the minimum time a pod is alive -func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { - minTime := 100000 * time.Hour // More than ten years +// GetMaxRedisPodTime returns the MAX uptime among the active Pods +func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { + maxTime := 0 * time.Hour rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) if err != nil { - return minTime, err + return maxTime, err } for _, redisNode := range rps.Items { if redisNode.Status.StartTime == nil { @@ -279,12 +356,12 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisF } start := redisNode.Status.StartTime.Round(time.Second) alive := time.Since(start) - r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) - if alive < minTime { - minTime = alive + r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) + if alive > maxTime { + maxTime = alive } } - return minTime, nil + return maxTime, nil } // GetRedisesSlavesPods returns pods names of the Redis slave nodes diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go index cd81c901a..af64d7e13 100644 --- a/operator/redisfailover/service/check_test.go +++ b/operator/redisfailover/service/check_test.go @@ -658,7 +658,7 @@ func TestGetNumberMastersTwo(t *testing.T) { assert.Equal(2, masterNumber, "the master number should be ok") } -func TestGetMinimumRedisPodTimeGetStatefulSetPodsError(t *testing.T) { +func TestGetMaxRedisPodTimeGetStatefulSetPodsError(t *testing.T) { assert := assert.New(t) rf := generateRF() @@ -669,11 +669,11 @@ func TestGetMinimumRedisPodTimeGetStatefulSetPodsError(t *testing.T) { checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) - _, err := checker.GetMinimumRedisPodTime(rf) + _, err := checker.GetMaxRedisPodTime(rf) assert.Error(err) } -func TestGetMinimumRedisPodTime(t *testing.T) { +func TestGetMaxRedisPodTime(t *testing.T) { assert := assert.New(t) rf := generateRF() @@ -707,11 +707,11 @@ func TestGetMinimumRedisPodTime(t *testing.T) { checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) - minTime, err := checker.GetMinimumRedisPodTime(rf) + maxTime, err := checker.GetMaxRedisPodTime(rf) assert.NoError(err) - expected := now.Sub(oneMinute).Round(time.Second) - assert.Equal(expected, minTime.Round(time.Second), "the closest time should be given") + expected := now.Sub(oneHour).Round(time.Second) + assert.Equal(expected, maxTime.Round(time.Second), "the closest time should be given") } func TestGetRedisPodsNames(t *testing.T) { diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 7c87da427..390a6ac48 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -112,6 +112,7 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove newMasterIP = pod.Status.PodIP r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("New master is %s with ip %s", pod.Name, newMasterIP) if err := r.redisClient.MakeMaster(newMasterIP, port, password); err != nil { + newMasterIP = "" r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make new master failed, master ip: %s, error: %v", pod.Status.PodIP, err) continue } @@ -134,7 +135,11 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove } } } - return nil + if newMasterIP == "" { + return errors.New("SetOldestAsMaster- unable to set master") + } else { + return nil + } } // SetMasterOnAll puts all redis nodes as a slave of a given master @@ -151,18 +156,15 @@ func (r *RedisFailoverHealer) SetMasterOnAll(masterIP string, rf *redisfailoverv port := getRedisPort(rf.Spec.Redis.Port) for _, pod := range ssp.Items { - if pod.Status.PodIP == masterIP { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Ensure pod %s is master", pod.Name) - if err := r.redisClient.MakeMaster(masterIP, port, password); err != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make master failed, master ip: %s, error: %v", masterIP, err) - return err - } - - err = r.setMasterLabelIfNecessary(rf.Namespace, pod) - if err != nil { - return err - } + //During this configuration process if there is a new master selected , bailout + isMaster, err := r.redisClient.IsMaster(masterIP, port, password) + if err != nil || !isMaster { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("check master failed maybe this node is not ready(ip changed), or sentinel made a switch: %s", masterIP) + return err } else { + if pod.Status.PodIP == masterIP { + continue + } r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Making pod %s slave of %s", pod.Name, masterIP) if err := r.redisClient.MakeSlaveOfWithPort(pod.Status.PodIP, masterIP, port, password); err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make slave failed, slave ip: %s, master ip: %s, error: %v", pod.Status.PodIP, masterIP, err) diff --git a/operator/redisfailover/service/heal_test.go b/operator/redisfailover/service/heal_test.go index f9feb8ea4..27bd8cc11 100644 --- a/operator/redisfailover/service/heal_test.go +++ b/operator/redisfailover/service/heal_test.go @@ -41,7 +41,7 @@ func TestSetOldestAsMasterNewMasterError(t *testing.T) { healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) err := healer.SetOldestAsMaster(rf) - assert.NoError(err) + assert.Error(err) } func TestSetOldestAsMaster(t *testing.T) { @@ -204,8 +204,7 @@ func TestSetMasterOnAllMakeMasterError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Once().Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(errors.New("")) - + mr.On("IsMaster", "0.0.0.0", "0", "").Return(false, errors.New("")) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) err := healer.SetMasterOnAll("0.0.0.0", rf) @@ -236,7 +235,7 @@ func TestSetMasterOnAllMakeSlaveOfError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(nil) + mr.On("IsMaster", "0.0.0.0", "0", "").Return(true, nil) mr.On("MakeSlaveOfWithPort", "1.1.1.1", "0.0.0.0", "0", "").Once().Return(errors.New("")) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) @@ -269,7 +268,7 @@ func TestSetMasterOnAll(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(nil) + mr.On("IsMaster", "0.0.0.0", "0", "").Return(true, nil) mr.On("MakeSlaveOfWithPort", "1.1.1.1", "0.0.0.0", "0", "").Once().Return(nil) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) diff --git a/service/redis/client.go b/service/redis/client.go index bb89c1f96..8c42690bf 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -30,6 +30,7 @@ type Client interface { SetCustomSentinelConfig(ip string, configs []string) error SetCustomRedisConfig(ip string, port string, configs []string, password string) error SlaveIsReady(ip, port, password string) (bool, error) + SentinelCheckQuorum(ip string) error } type client struct { @@ -327,6 +328,47 @@ func (c *client) SetCustomSentinelConfig(ip string, configs []string) error { return nil } +func (c *client) SentinelCheckQuorum(ip string) error { + + options := &rediscli.Options{ + Addr: net.JoinHostPort(ip, sentinelPort), + Password: "", + DB: 0, + } + rClient := rediscli.NewSentinelClient(options) + defer rClient.Close() + cmd := rClient.CkQuorum(context.TODO(), masterName) + res, err := cmd.Result() + + if err != nil { + log.Warnf("Unable to get result for CKQUORUM comand") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, getRedisError(err)) + return err + } + log.Debugf("SentinelCheckQuorum cmd result: %s", res) + s := strings.Split(res, " ") + status := s[0] + quorum := s[1] + + if status == "" { + log.Errorf("quorum command result unexpected output") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command result unexpected output") + return fmt.Errorf("quorum command result unexpected output") + } + if status == "(error)" && quorum == "NOQUORUM" { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "NOQUORUM") + return fmt.Errorf("quorum Not available") + + } else if status == "OK" { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "QUORUM") + return nil + } else { + log.Errorf("quorum command status unexpected !!!") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command status unexpected output") + return fmt.Errorf("quorum status unexpected %s", status) + } + +} func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, password string) error { options := &rediscli.Options{ Addr: net.JoinHostPort(ip, port),