From 7b4cf3d98fef142a8a57c0eb29b9c2f44d11c7e7 Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Mon, 5 Dec 2022 16:12:07 +0530 Subject: [PATCH 1/6] Fixes to avoid operator conflict with the sentinel in choosing master --- operator/redisfailover/checker.go | 40 +++++++++---------- operator/redisfailover/service/check.go | 53 ++++++++++++++++++++----- operator/redisfailover/service/heal.go | 25 ++++++------ service/redis/client.go | 40 +++++++++++++++++++ 4 files changed, 116 insertions(+), 42 deletions(-) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 0324b341b..03a954487 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -9,10 +9,6 @@ import ( "github.com/spotahome/redis-operator/metrics" ) -const ( - timeToPrepare = 2 * time.Minute -) - // UpdateRedisesPods if the running version of pods are equal to the statefulset one func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error { redises, err := r.rfChecker.GetRedisesIPs(rf) @@ -120,32 +116,34 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } switch nMasters { case 0: + //During the First boot(New deployment or all pods of the statefulsets have restarted), + //Sentinesl will not be able to choose the master , so operator should select a master + //Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached + //Operator can choose a master , all the abobe scenarios can be checked by asking the all the sentinels + //if its in a postion to choose a master. + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0") setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected")) - redisesIP, err := r.rfChecker.GetRedisesIPs(rf) - if err != nil { - return err - } - if len(redisesIP) == 1 { - if err := r.rfHealer.MakeMaster(redisesIP[0], rf); err != nil { - return err - } - break - } - minTime, err2 := r.rfChecker.GetMinimumRedisPodTime(rf) + maxUptime, err2 := r.rfChecker.GetMaxRedisPodTime(rf) if err2 != nil { return err2 } - if minTime > timeToPrepare { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds()) - // We can consider there's an error - if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { - return err2 + + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds()) + //Check If Sentinel can choose a master + unHealthySentinels, err2 := r.rfChecker.CheckSentinelQuorum(rf) + if err2 != nil { + // Sentinels are not in a situation to choose a master we pick one + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", unHealthySentinels) + if err3 := r.rfHealer.SetOldestAsMaster(rf); err3 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + return err3 } } else { // We'll wait until failover is done - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover") + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master found, wait until failover") return nil } + case 1: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) default: diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 1a2a94914..f95f4af3b 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -23,12 +23,13 @@ type RedisFailoverCheck interface { CheckAllSlavesFromMaster(master string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error + CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) CheckSentinelMonitor(sentinel string, monitor ...string) error GetMasterIP(rFailover *redisfailoverv1.RedisFailover) (string, error) GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error) GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) - GetMinimumRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) + GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) @@ -148,6 +149,40 @@ func (r *RedisFailoverChecker) CheckSentinelNumberInMemory(sentinel string, rf * return nil } +// This function will call the sentinel client apis to check with sentinel if the sentinel is in a state +// to heal the redis system +func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) { + + var unhealthyCnt int = -1 + + sentinels, err := r.GetSentinelsIPs(rFailover) + if err != nil { + r.logger.Errorf("CheckSentinelQuorum Error in getting sentinel Ip's") + return unhealthyCnt, err + } + if len(sentinels) < int(getQuorum(rFailover)) { + unhealthyCnt = int(getQuorum(rFailover)) - len(sentinels) + r.logger.Errorf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) + return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") + } + + unhealthyCnt = 0 + for _, sip := range sentinels { + err = r.redisClient.SentinelCheckQuorum(sip, "mymaster") + if err != nil { + unhealthyCnt += 1 + } else { + continue + } + } + if unhealthyCnt < int(getQuorum(rFailover)) { + return unhealthyCnt, nil + } else { + r.logger.Errorf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) + return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") + } +} + // CheckSentinelSlavesNumberInMemory controls that the provided sentinel has only the expected slaves number. func (r *RedisFailoverChecker) CheckSentinelSlavesNumberInMemory(sentinel string, rf *redisfailoverv1.RedisFailover) error { nSlaves, err := r.redisClient.GetNumberSentinelSlavesInMemory(sentinel) @@ -266,12 +301,12 @@ func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover return sentinels, nil } -// GetMinimumRedisPodTime returns the minimum time a pod is alive -func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { - minTime := 100000 * time.Hour // More than ten years +// GetMaxRedisPodTime returns the MAX uptime among the active Pods +func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { + maxTime := 0 * time.Hour rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) if err != nil { - return minTime, err + return maxTime, err } for _, redisNode := range rps.Items { if redisNode.Status.StartTime == nil { @@ -279,12 +314,12 @@ func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisF } start := redisNode.Status.StartTime.Round(time.Second) alive := time.Since(start) - r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) - if alive < minTime { - minTime = alive + r.logger.Debugf("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) + if alive > maxTime { + maxTime = alive } } - return minTime, nil + return maxTime, nil } // GetRedisesSlavesPods returns pods names of the Redis slave nodes diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 7c87da427..d469ee562 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -112,6 +112,7 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove newMasterIP = pod.Status.PodIP r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("New master is %s with ip %s", pod.Name, newMasterIP) if err := r.redisClient.MakeMaster(newMasterIP, port, password); err != nil { + newMasterIP = "" r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make new master failed, master ip: %s, error: %v", pod.Status.PodIP, err) continue } @@ -134,7 +135,11 @@ func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailove } } } - return nil + if newMasterIP == "" { + return errors.New("SetOldestAsMaster- unable to set master") + } else { + return nil + } } // SetMasterOnAll puts all redis nodes as a slave of a given master @@ -151,18 +156,14 @@ func (r *RedisFailoverHealer) SetMasterOnAll(masterIP string, rf *redisfailoverv port := getRedisPort(rf.Spec.Redis.Port) for _, pod := range ssp.Items { - if pod.Status.PodIP == masterIP { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Ensure pod %s is master", pod.Name) - if err := r.redisClient.MakeMaster(masterIP, port, password); err != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make master failed, master ip: %s, error: %v", masterIP, err) - return err - } - - err = r.setMasterLabelIfNecessary(rf.Namespace, pod) - if err != nil { - return err - } + isMaster, err := r.redisClient.IsMaster(masterIP, port, password) + if err != nil || !isMaster { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("check master failed maybe this node is not ready(ip changed), or sentinel made a switch: %s", masterIP) + return err } else { + if pod.Status.PodIP == masterIP { + continue + } r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Making pod %s slave of %s", pod.Name, masterIP) if err := r.redisClient.MakeSlaveOfWithPort(pod.Status.PodIP, masterIP, port, password); err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Make slave failed, slave ip: %s, master ip: %s, error: %v", pod.Status.PodIP, masterIP, err) diff --git a/service/redis/client.go b/service/redis/client.go index bb89c1f96..b19713bb0 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -30,6 +30,7 @@ type Client interface { SetCustomSentinelConfig(ip string, configs []string) error SetCustomRedisConfig(ip string, port string, configs []string, password string) error SlaveIsReady(ip, port, password string) (bool, error) + SentinelCheckQuorum(ip string, master string) error } type client struct { @@ -327,6 +328,45 @@ func (c *client) SetCustomSentinelConfig(ip string, configs []string) error { return nil } +func (c *client) SentinelCheckQuorum(ip string, master string) error { + + options := &rediscli.Options{ + Addr: net.JoinHostPort(ip, sentinelPort), + Password: "", + DB: 0, + } + rClient := rediscli.NewSentinelClient(options) + defer rClient.Close() + cmd := rClient.CkQuorum(context.TODO(), master) + res, err := cmd.Result() + if err != nil { + log.Errorf("Unable to get result for CKQUORUM comand") + return err + } + + log.Errorf("CKQUORUM OUTPUT IS:%s", res) + + s := strings.Split(res, " ") + status := s[0] + quorum := s[1] + + if status == "" { + log.Errorf("quorum command result unexpected output") + return fmt.Errorf("quorum command result unexpected output") + } + if status == "(error)" && quorum == "NOQUORUM" { + log.Errorf("quorum command result - NOQUORUM") + return fmt.Errorf("quorum Not available") + + } else if status == "OK" { + log.Errorf("quorum command result - QUORUM") + return nil + } else { + log.Errorf("quorum command result unexpected !!!") + return fmt.Errorf("quorum result unexpected %s", status) + } + +} func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, password string) error { options := &rediscli.Options{ Addr: net.JoinHostPort(ip, port), From 4ac4766adf64c7d2afeffe389329173718bf1cb8 Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Tue, 6 Dec 2022 20:03:52 +0530 Subject: [PATCH 2/6] Running mocks and fixing first boot scenarios --- metrics/metrics.go | 1 + .../service/RedisFailoverCheck.go | 63 +++++++++++++++++ mocks/service/redis/Client.go | 14 ++++ operator/redisfailover/checker.go | 60 ++++++++++++---- operator/redisfailover/service/check.go | 70 ++++++++++++++++++- operator/redisfailover/service/heal.go | 1 + service/redis/client.go | 6 +- 7 files changed, 194 insertions(+), 21 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 3b07785a5..c40aab54c 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -32,6 +32,7 @@ const ( HEALTHY = 0.0 REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" + NO_MASTER = "NO_MASTER_AVAILABLE" NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" diff --git a/mocks/operator/redisfailover/service/RedisFailoverCheck.go b/mocks/operator/redisfailover/service/RedisFailoverCheck.go index c21c37f1a..643d6b0ef 100644 --- a/mocks/operator/redisfailover/service/RedisFailoverCheck.go +++ b/mocks/operator/redisfailover/service/RedisFailoverCheck.go @@ -29,6 +29,27 @@ func (_m *RedisFailoverCheck) CheckAllSlavesFromMaster(master string, rFailover return r0 } +// CheckIfMasterLocalhost provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) CheckIfMasterLocalhost(rFailover *v1.RedisFailover) (bool, error) { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + var r1 error + if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { + r1 = rf(rFailover) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // CheckRedisNumber provides a mock function with given fields: rFailover func (_m *RedisFailoverCheck) CheckRedisNumber(rFailover *v1.RedisFailover) error { ret := _m.Called(rFailover) @@ -113,6 +134,27 @@ func (_m *RedisFailoverCheck) CheckSentinelNumberInMemory(sentinel string, rFail return r0 } +// CheckSentinelQuorum provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) CheckSentinelQuorum(rFailover *v1.RedisFailover) (int, error) { + ret := _m.Called(rFailover) + + var r0 int + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) int); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { + r1 = rf(rFailover) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // CheckSentinelSlavesNumberInMemory provides a mock function with given fields: sentinel, rFailover func (_m *RedisFailoverCheck) CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *v1.RedisFailover) error { ret := _m.Called(sentinel, rFailover) @@ -148,6 +190,27 @@ func (_m *RedisFailoverCheck) GetMasterIP(rFailover *v1.RedisFailover) (string, return r0, r1 } +// GetMaxRedisPodTime provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) GetMaxRedisPodTime(rFailover *v1.RedisFailover) (time.Duration, error) { + ret := _m.Called(rFailover) + + var r0 time.Duration + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) time.Duration); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(time.Duration) + } + + var r1 error + if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { + r1 = rf(rFailover) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // GetMinimumRedisPodTime provides a mock function with given fields: rFailover func (_m *RedisFailoverCheck) GetMinimumRedisPodTime(rFailover *v1.RedisFailover) (time.Duration, error) { ret := _m.Called(rFailover) diff --git a/mocks/service/redis/Client.go b/mocks/service/redis/Client.go index b52028d19..b44674d51 100644 --- a/mocks/service/redis/Client.go +++ b/mocks/service/redis/Client.go @@ -205,6 +205,20 @@ func (_m *Client) ResetSentinel(ip string) error { return r0 } +// SentinelCheckQuorum provides a mock function with given fields: ip +func (_m *Client) SentinelCheckQuorum(ip string) error { + ret := _m.Called(ip) + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(ip) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // SetCustomRedisConfig provides a mock function with given fields: ip, port, configs, password func (_m *Client) SetCustomRedisConfig(ip string, port string, configs []string, password string) error { ret := _m.Called(ip, port, configs, password) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 03a954487..977c4490b 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -114,34 +114,64 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { return err } + switch nMasters { case 0: + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected")) + //when number of redis replicas is 1 , the redis is configured for standalone master mode + //Configure to master + if rf.Spec.Redis.Replicas == 1 { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master") + if err = r.rfHealer.SetOldestAsMaster(rf); err != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + return err + } + return nil + } //During the First boot(New deployment or all pods of the statefulsets have restarted), //Sentinesl will not be able to choose the master , so operator should select a master //Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached - //Operator can choose a master , all the abobe scenarios can be checked by asking the all the sentinels - //if its in a postion to choose a master. + //Operator can choose a master , These scenarios can be checked by asking the all the sentinels + //if its in a postion to choose a master also check if the redis is configured with local host IP as master. r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0") - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected")) - maxUptime, err2 := r.rfChecker.GetMaxRedisPodTime(rf) - if err2 != nil { - return err2 + maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf) + if err != nil { + return err } r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds()) - //Check If Sentinel can choose a master - unHealthySentinels, err2 := r.rfChecker.CheckSentinelQuorum(rf) - if err2 != nil { + //Check If Sentinel has quorum to take a failover decision + noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf) + if err != nil { // Sentinels are not in a situation to choose a master we pick one - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", unHealthySentinels) - if err3 := r.rfHealer.SetOldestAsMaster(rf); err3 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt) + if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") - return err3 + return err2 } + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, nil) } else { - // We'll wait until failover is done - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master found, wait until failover") - return nil + //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master + status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf) + if err2 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later") + return err2 + } else if status { + // all avaialable redis pods have local host ip as master + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection") + if err3 := r.rfHealer.SetOldestAsMaster(rf); err3 != nil { + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") + return err3 + } + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, nil) + } else { + + // We'll wait until failover is done + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually") + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually")) + return nil + } + } case 1: diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index f95f4af3b..22688532d 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -24,11 +24,13 @@ type RedisFailoverCheck interface { CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) + CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) CheckSentinelMonitor(sentinel string, monitor ...string) error GetMasterIP(rFailover *redisfailoverv1.RedisFailover) (string, error) GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error) GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) + GetMinimumRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) @@ -149,6 +151,47 @@ func (r *RedisFailoverChecker) CheckSentinelNumberInMemory(sentinel string, rf * return nil } +// This function will check if the local host ip is set as the master for all currently available pods +// This can be used to detect the fresh boot of all the redis pods +// This function returns true if it all available pods have local host ip as master, +// false if atleast one of the ip is not local hostip +// false and error if any function fails +func (r *RedisFailoverChecker) CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error) { + + var lhmaster int = 0 + redisIps, err := r.GetRedisesIPs(rFailover) + if len(redisIps) == 0 || err != nil { + r.logger.Warningf("CheckIfMasterLocalhost GetRedisesIPs Failed- unable to fetch any redis Ips Currently") + return false, errors.New("unable to fetch any redis Ips Currently") + } + password, err := k8s.GetRedisPassword(r.k8sService, rFailover) + if err != nil { + r.logger.Errorf("CheckIfMasterLocalhost -- GetRedisPassword Failed") + return false, err + } + rport := getRedisPort(rFailover.Spec.Redis.Port) + for _, sip := range redisIps { + master, err := r.redisClient.GetSlaveOf(sip, rport, password) + if err != nil { + r.logger.Warningf("CheckIfMasterLocalhost -- GetSlaveOf Failed") + return false, err + } else if master == "" { + r.logger.Warningf("CheckIfMasterLocalhost -- Master already available ?? check manually") + return false, errors.New("unexpected master state, fix manually") + } else { + if master == "127.0.0.1" { + lhmaster++ + } + } + } + if lhmaster == len(redisIps) { + r.logger.Infof("all available redis configured localhost as master , opertor must heal") + return true, nil + } + r.logger.Infof("atleast one pod does not have localhost as master , opertor should not heal") + return false, nil +} + // This function will call the sentinel client apis to check with sentinel if the sentinel is in a state // to heal the redis system func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error) { @@ -157,18 +200,18 @@ func (r *RedisFailoverChecker) CheckSentinelQuorum(rFailover *redisfailoverv1.Re sentinels, err := r.GetSentinelsIPs(rFailover) if err != nil { - r.logger.Errorf("CheckSentinelQuorum Error in getting sentinel Ip's") + r.logger.Warningf("CheckSentinelQuorum Error in getting sentinel Ip's") return unhealthyCnt, err } if len(sentinels) < int(getQuorum(rFailover)) { unhealthyCnt = int(getQuorum(rFailover)) - len(sentinels) - r.logger.Errorf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) + r.logger.Warningf("insufficnet sentinel to reach Quorum - Unhealthy count: %d", unhealthyCnt) return unhealthyCnt, errors.New("insufficnet sentinel to reach Quorum") } unhealthyCnt = 0 for _, sip := range sentinels { - err = r.redisClient.SentinelCheckQuorum(sip, "mymaster") + err = r.redisClient.SentinelCheckQuorum(sip) if err != nil { unhealthyCnt += 1 } else { @@ -301,6 +344,27 @@ func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover return sentinels, nil } +// GetMinimumRedisPodTime returns the minimum time a pod is alive +func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { + minTime := 100000 * time.Hour // More than ten years + rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) + if err != nil { + return minTime, err + } + for _, redisNode := range rps.Items { + if redisNode.Status.StartTime == nil { + continue + } + start := redisNode.Status.StartTime.Round(time.Second) + alive := time.Since(start) + r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) + if alive < minTime { + minTime = alive + } + } + return minTime, nil +} + // GetMaxRedisPodTime returns the MAX uptime among the active Pods func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { maxTime := 0 * time.Hour diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index d469ee562..390a6ac48 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -156,6 +156,7 @@ func (r *RedisFailoverHealer) SetMasterOnAll(masterIP string, rf *redisfailoverv port := getRedisPort(rf.Spec.Redis.Port) for _, pod := range ssp.Items { + //During this configuration process if there is a new master selected , bailout isMaster, err := r.redisClient.IsMaster(masterIP, port, password) if err != nil || !isMaster { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("check master failed maybe this node is not ready(ip changed), or sentinel made a switch: %s", masterIP) diff --git a/service/redis/client.go b/service/redis/client.go index b19713bb0..dffb9ccf3 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -30,7 +30,7 @@ type Client interface { SetCustomSentinelConfig(ip string, configs []string) error SetCustomRedisConfig(ip string, port string, configs []string, password string) error SlaveIsReady(ip, port, password string) (bool, error) - SentinelCheckQuorum(ip string, master string) error + SentinelCheckQuorum(ip string) error } type client struct { @@ -328,7 +328,7 @@ func (c *client) SetCustomSentinelConfig(ip string, configs []string) error { return nil } -func (c *client) SentinelCheckQuorum(ip string, master string) error { +func (c *client) SentinelCheckQuorum(ip string) error { options := &rediscli.Options{ Addr: net.JoinHostPort(ip, sentinelPort), @@ -337,7 +337,7 @@ func (c *client) SentinelCheckQuorum(ip string, master string) error { } rClient := rediscli.NewSentinelClient(options) defer rClient.Close() - cmd := rClient.CkQuorum(context.TODO(), master) + cmd := rClient.CkQuorum(context.TODO(), masterName) res, err := cmd.Result() if err != nil { log.Errorf("Unable to get result for CKQUORUM comand") From 872590cc11c57d25d774847e885a7a6aefd297ea Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Wed, 7 Dec 2022 16:29:56 +0530 Subject: [PATCH 3/6] added tes cases --- operator/redisfailover/checker_test.go | 81 ++++++++++++++++----- operator/redisfailover/service/heal_test.go | 9 +-- 2 files changed, 67 insertions(+), 23 deletions(-) diff --git a/operator/redisfailover/checker_test.go b/operator/redisfailover/checker_test.go index c8ae26970..2e83a47b4 100644 --- a/operator/redisfailover/checker_test.go +++ b/operator/redisfailover/checker_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/mock" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -25,7 +24,9 @@ func TestCheckAndHeal(t *testing.T) { name string nMasters int nRedis int - forceNewMaster bool + forceNewMasterNoQrm bool + forceNewMasterFirstBoot bool + singleMasterTest bool slavesOK bool sentinelMonitorOK bool sentinelNumberInMemoryOK bool @@ -39,7 +40,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Everything ok, no need to heal", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -53,7 +56,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Multiple masters", nMasters: 2, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -67,7 +72,9 @@ func TestCheckAndHeal(t *testing.T) { name: "No masters but wait", nMasters: 0, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -81,7 +88,9 @@ func TestCheckAndHeal(t *testing.T) { name: "No masters, only one redis available, make master", nMasters: 0, nRedis: 1, - forceNewMaster: false, + singleMasterTest: true, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -92,10 +101,27 @@ func TestCheckAndHeal(t *testing.T) { allowSentinels: false, }, { - name: "No masters, set random", + name: "No masters,No sentinel quorum set random", nMasters: 0, nRedis: 3, - forceNewMaster: true, + singleMasterTest: false, + forceNewMasterNoQrm: true, + forceNewMasterFirstBoot: false, + slavesOK: true, + sentinelMonitorOK: true, + sentinelNumberInMemoryOK: true, + redisCheckNumberOK: true, + redisSetMasterOnAllOK: true, + sentinelSlavesNumberInMemoryOK: true, + allowSentinels: false, + }, + { + name: "No masters,Sentinel Quorum but slave of local host set random", + nMasters: 0, + nRedis: 3, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: true, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -108,7 +134,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Slaves from master wrong", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: false, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -122,7 +150,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels not pointing correct monitor", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: false, sentinelNumberInMemoryOK: true, @@ -136,7 +166,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels with wrong number of sentinels", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: false, @@ -150,7 +182,9 @@ func TestCheckAndHeal(t *testing.T) { name: "Sentinels with wrong number of slaves", nMasters: 1, nRedis: 3, - forceNewMaster: false, + singleMasterTest: false, + forceNewMasterNoQrm: false, + forceNewMasterFirstBoot: false, slavesOK: true, sentinelMonitorOK: true, sentinelNumberInMemoryOK: true, @@ -251,6 +285,9 @@ func TestCheckAndHeal(t *testing.T) { allowSentinels = test.allowSentinels rf.Spec.BootstrapNode.AllowSentinels = allowSentinels } + if test.singleMasterTest { + rf.Spec.Redis.Replicas = 1 + } expErr := false continueTests := true @@ -297,18 +334,26 @@ func TestCheckAndHeal(t *testing.T) { mrfc.On("GetNumberMasters", rf).Once().Return(test.nMasters, nil) switch test.nMasters { case 0: - mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil) - if test.nRedis == 1 { - mrfh.On("MakeMaster", mock.Anything, rf).Once().Return(nil) + //mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil) + if rf.Spec.Redis.Replicas == 1 { + mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) + continueTests = false break } - if test.forceNewMaster { - mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Hour, nil) + mrfc.On("GetMaxRedisPodTime", rf).Once().Return(1*time.Hour, nil) + if test.forceNewMasterNoQrm { + mrfc.On("CheckSentinelQuorum", rf).Once().Return(1, errors.New("")) + mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) + } else if test.forceNewMasterFirstBoot { + mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil) + mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(true, nil) mrfh.On("SetOldestAsMaster", rf).Once().Return(nil) } else { - mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Second, nil) + mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil) + mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(false, nil) continueTests = false } + case 1: break default: diff --git a/operator/redisfailover/service/heal_test.go b/operator/redisfailover/service/heal_test.go index f9feb8ea4..27bd8cc11 100644 --- a/operator/redisfailover/service/heal_test.go +++ b/operator/redisfailover/service/heal_test.go @@ -41,7 +41,7 @@ func TestSetOldestAsMasterNewMasterError(t *testing.T) { healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) err := healer.SetOldestAsMaster(rf) - assert.NoError(err) + assert.Error(err) } func TestSetOldestAsMaster(t *testing.T) { @@ -204,8 +204,7 @@ func TestSetMasterOnAllMakeMasterError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Once().Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(errors.New("")) - + mr.On("IsMaster", "0.0.0.0", "0", "").Return(false, errors.New("")) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) err := healer.SetMasterOnAll("0.0.0.0", rf) @@ -236,7 +235,7 @@ func TestSetMasterOnAllMakeSlaveOfError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(nil) + mr.On("IsMaster", "0.0.0.0", "0", "").Return(true, nil) mr.On("MakeSlaveOfWithPort", "1.1.1.1", "0.0.0.0", "0", "").Once().Return(errors.New("")) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) @@ -269,7 +268,7 @@ func TestSetMasterOnAll(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Return(nil) mr := &mRedisService.Client{} - mr.On("MakeMaster", "0.0.0.0", "0", "").Once().Return(nil) + mr.On("IsMaster", "0.0.0.0", "0", "").Return(true, nil) mr.On("MakeSlaveOfWithPort", "1.1.1.1", "0.0.0.0", "0", "").Once().Return(nil) healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{}) From dd3142a057e29fdec869b8e2d27b526db68b6c3d Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Thu, 8 Dec 2022 17:53:10 +0530 Subject: [PATCH 4/6] removed the GetMinimumRedisPodTime function --- .../service/RedisFailoverCheck.go | 21 ------------------ operator/redisfailover/service/check.go | 22 ------------------- operator/redisfailover/service/check_test.go | 12 +++++----- 3 files changed, 6 insertions(+), 49 deletions(-) diff --git a/mocks/operator/redisfailover/service/RedisFailoverCheck.go b/mocks/operator/redisfailover/service/RedisFailoverCheck.go index 643d6b0ef..e58ad55b9 100644 --- a/mocks/operator/redisfailover/service/RedisFailoverCheck.go +++ b/mocks/operator/redisfailover/service/RedisFailoverCheck.go @@ -211,27 +211,6 @@ func (_m *RedisFailoverCheck) GetMaxRedisPodTime(rFailover *v1.RedisFailover) (t return r0, r1 } -// GetMinimumRedisPodTime provides a mock function with given fields: rFailover -func (_m *RedisFailoverCheck) GetMinimumRedisPodTime(rFailover *v1.RedisFailover) (time.Duration, error) { - ret := _m.Called(rFailover) - - var r0 time.Duration - if rf, ok := ret.Get(0).(func(*v1.RedisFailover) time.Duration); ok { - r0 = rf(rFailover) - } else { - r0 = ret.Get(0).(time.Duration) - } - - var r1 error - if rf, ok := ret.Get(1).(func(*v1.RedisFailover) error); ok { - r1 = rf(rFailover) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - // GetNumberMasters provides a mock function with given fields: rFailover func (_m *RedisFailoverCheck) GetNumberMasters(rFailover *v1.RedisFailover) (int, error) { ret := _m.Called(rFailover) diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 22688532d..e54f49005 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -30,7 +30,6 @@ type RedisFailoverCheck interface { GetNumberMasters(rFailover *redisfailoverv1.RedisFailover) (int, error) GetRedisesIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetSentinelsIPs(rFailover *redisfailoverv1.RedisFailover) ([]string, error) - GetMinimumRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetMaxRedisPodTime(rFailover *redisfailoverv1.RedisFailover) (time.Duration, error) GetRedisesSlavesPods(rFailover *redisfailoverv1.RedisFailover) ([]string, error) GetRedisesMasterPod(rFailover *redisfailoverv1.RedisFailover) (string, error) @@ -344,27 +343,6 @@ func (r *RedisFailoverChecker) GetSentinelsIPs(rf *redisfailoverv1.RedisFailover return sentinels, nil } -// GetMinimumRedisPodTime returns the minimum time a pod is alive -func (r *RedisFailoverChecker) GetMinimumRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { - minTime := 100000 * time.Hour // More than ten years - rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) - if err != nil { - return minTime, err - } - for _, redisNode := range rps.Items { - if redisNode.Status.StartTime == nil { - continue - } - start := redisNode.Status.StartTime.Round(time.Second) - alive := time.Since(start) - r.logger.Infof("Pod %s has been alive for %.f seconds", redisNode.Status.PodIP, alive.Seconds()) - if alive < minTime { - minTime = alive - } - } - return minTime, nil -} - // GetMaxRedisPodTime returns the MAX uptime among the active Pods func (r *RedisFailoverChecker) GetMaxRedisPodTime(rf *redisfailoverv1.RedisFailover) (time.Duration, error) { maxTime := 0 * time.Hour diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go index cd81c901a..af64d7e13 100644 --- a/operator/redisfailover/service/check_test.go +++ b/operator/redisfailover/service/check_test.go @@ -658,7 +658,7 @@ func TestGetNumberMastersTwo(t *testing.T) { assert.Equal(2, masterNumber, "the master number should be ok") } -func TestGetMinimumRedisPodTimeGetStatefulSetPodsError(t *testing.T) { +func TestGetMaxRedisPodTimeGetStatefulSetPodsError(t *testing.T) { assert := assert.New(t) rf := generateRF() @@ -669,11 +669,11 @@ func TestGetMinimumRedisPodTimeGetStatefulSetPodsError(t *testing.T) { checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) - _, err := checker.GetMinimumRedisPodTime(rf) + _, err := checker.GetMaxRedisPodTime(rf) assert.Error(err) } -func TestGetMinimumRedisPodTime(t *testing.T) { +func TestGetMaxRedisPodTime(t *testing.T) { assert := assert.New(t) rf := generateRF() @@ -707,11 +707,11 @@ func TestGetMinimumRedisPodTime(t *testing.T) { checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) - minTime, err := checker.GetMinimumRedisPodTime(rf) + maxTime, err := checker.GetMaxRedisPodTime(rf) assert.NoError(err) - expected := now.Sub(oneMinute).Round(time.Second) - assert.Equal(expected, minTime.Round(time.Second), "the closest time should be given") + expected := now.Sub(oneHour).Round(time.Second) + assert.Equal(expected, maxTime.Round(time.Second), "the closest time should be given") } func TestGetRedisPodsNames(t *testing.T) { From 61924de54d8cb806bf81dbb04b5ab5d4f5e8ce8a Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Fri, 9 Dec 2022 13:07:01 +0530 Subject: [PATCH 5/6] metrics fixes and log level corrections --- metrics/metrics.go | 1 + operator/redisfailover/checker.go | 15 ++++++++++----- service/redis/client.go | 16 ++++++++++------ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index c40aab54c..1087b3dec 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -66,6 +66,7 @@ const ( MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER" MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE" GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE" + CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM" SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" ) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 977c4490b..89a0c2f19 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -122,7 +122,9 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e //Configure to master if rf.Spec.Redis.Replicas == 1 { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master") - if err = r.rfHealer.SetOldestAsMaster(rf); err != nil { + err = r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err) + if err != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") return err } @@ -145,11 +147,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { // Sentinels are not in a situation to choose a master we pick one r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt) - if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil { + err2 := r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2) + if err2 != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") return err2 } - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, nil) } else { //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf) @@ -159,11 +162,13 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } else if status { // all avaialable redis pods have local host ip as master r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection") - if err3 := r.rfHealer.SetOldestAsMaster(rf); err3 != nil { + err3 := r.rfHealer.SetOldestAsMaster(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3) + if err3 != nil { r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master") return err3 } - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, nil) + } else { // We'll wait until failover is done diff --git a/service/redis/client.go b/service/redis/client.go index dffb9ccf3..f7856b33e 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -339,30 +339,34 @@ func (c *client) SentinelCheckQuorum(ip string) error { defer rClient.Close() cmd := rClient.CkQuorum(context.TODO(), masterName) res, err := cmd.Result() + if err != nil { - log.Errorf("Unable to get result for CKQUORUM comand") + log.Warnf("Unable to get result for CKQUORUM comand") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, getRedisError(err)) return err } - - log.Errorf("CKQUORUM OUTPUT IS:%s", res) - + log.Debugf("SentinelCheckQuorum cmd result: %s", res) s := strings.Split(res, " ") status := s[0] quorum := s[1] if status == "" { log.Errorf("quorum command result unexpected output") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command result unexpected output") return fmt.Errorf("quorum command result unexpected output") } if status == "(error)" && quorum == "NOQUORUM" { - log.Errorf("quorum command result - NOQUORUM") + log.Warnf("quorum command result - NOQUORUM") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "NOQUORUM") return fmt.Errorf("quorum Not available") } else if status == "OK" { - log.Errorf("quorum command result - QUORUM") + log.Infof("quorum command result - QUORUM") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "QUORUM") return nil } else { log.Errorf("quorum command result unexpected !!!") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command result unexpected output") return fmt.Errorf("quorum result unexpected %s", status) } From 6c3803287cacc09736acf9e6c48c6758b9f6a14a Mon Sep 17 00:00:00 2001 From: dinesh-murugiah Date: Fri, 9 Dec 2022 15:35:38 +0530 Subject: [PATCH 6/6] removed un-necessary logs --- service/redis/client.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/service/redis/client.go b/service/redis/client.go index f7856b33e..8c42690bf 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -356,18 +356,16 @@ func (c *client) SentinelCheckQuorum(ip string) error { return fmt.Errorf("quorum command result unexpected output") } if status == "(error)" && quorum == "NOQUORUM" { - log.Warnf("quorum command result - NOQUORUM") c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "NOQUORUM") return fmt.Errorf("quorum Not available") } else if status == "OK" { - log.Infof("quorum command result - QUORUM") c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.SUCCESS, "QUORUM") return nil } else { - log.Errorf("quorum command result unexpected !!!") - c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command result unexpected output") - return fmt.Errorf("quorum result unexpected %s", status) + log.Errorf("quorum command status unexpected !!!") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.CHECK_SENTINEL_QUORUM, metrics.FAIL, "quorum command status unexpected output") + return fmt.Errorf("quorum status unexpected %s", status) } }