Skip to content

Commit

Permalink
Merge pull request spotahome#536 from dinesh-murugiah/operator_fixes_sr
Browse files Browse the repository at this point in the history
Fixes to avoid operator conflict with the sentinel in choosing master
  • Loading branch information
ese authored Dec 9, 2022
2 parents 2a268b9 + 6c38032 commit ea4f232
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 74 deletions.
2 changes: 2 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
HEALTHY = 0.0
REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH"
SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH"
NO_MASTER = "NO_MASTER_AVAILABLE"
NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE"
SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
Expand Down Expand Up @@ -65,6 +66,7 @@ const (
MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER"
MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE"
GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE"
CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM"
SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY"
)

Expand Down
46 changes: 44 additions & 2 deletions mocks/operator/redisfailover/service/RedisFailoverCheck.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions mocks/service/redis/Client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

77 changes: 55 additions & 22 deletions operator/redisfailover/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ import (
"github.com/spotahome/redis-operator/metrics"
)

const (
timeToPrepare = 2 * time.Minute
)

// UpdateRedisesPods if the running version of pods are equal to the statefulset one
func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error {
redises, err := r.rfChecker.GetRedisesIPs(rf)
Expand Down Expand Up @@ -118,34 +114,71 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
if err != nil {
return err
}

switch nMasters {
case 0:
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
redisesIP, err := r.rfChecker.GetRedisesIPs(rf)
if err != nil {
return err
}
if len(redisesIP) == 1 {
if err := r.rfHealer.MakeMaster(redisesIP[0], rf); err != nil {
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
//when number of redis replicas is 1 , the redis is configured for standalone master mode
//Configure to master
if rf.Spec.Redis.Replicas == 1 {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master")
err = r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err)
if err != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err
}
break
return nil
}
minTime, err2 := r.rfChecker.GetMinimumRedisPodTime(rf)
if err2 != nil {
return err2
//During the First boot(New deployment or all pods of the statefulsets have restarted),
//Sentinesl will not be able to choose the master , so operator should select a master
//Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached
//Operator can choose a master , These scenarios can be checked by asking the all the sentinels
//if its in a postion to choose a master also check if the redis is configured with local host IP as master.
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0")
maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf)
if err != nil {
return err
}
if minTime > timeToPrepare {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
// We can consider there's an error
if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil {

r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds())
//Check If Sentinel has quorum to take a failover decision
noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf)
if err != nil {
// Sentinels are not in a situation to choose a master we pick one
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt)
err2 := r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2)
if err2 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err2
}
} else {
// We'll wait until failover is done
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover")
return nil
//sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master
status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf)
if err2 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later")
return err2
} else if status {
// all avaialable redis pods have local host ip as master
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection")
err3 := r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3)
if err3 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err3
}

} else {

// We'll wait until failover is done
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually")
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually"))
return nil
}

}

case 1:
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil)
default:
Expand Down
81 changes: 63 additions & 18 deletions operator/redisfailover/checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
Expand All @@ -25,7 +24,9 @@ func TestCheckAndHeal(t *testing.T) {
name string
nMasters int
nRedis int
forceNewMaster bool
forceNewMasterNoQrm bool
forceNewMasterFirstBoot bool
singleMasterTest bool
slavesOK bool
sentinelMonitorOK bool
sentinelNumberInMemoryOK bool
Expand All @@ -39,7 +40,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Everything ok, no need to heal",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -53,7 +56,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Multiple masters",
nMasters: 2,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -67,7 +72,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "No masters but wait",
nMasters: 0,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -81,7 +88,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "No masters, only one redis available, make master",
nMasters: 0,
nRedis: 1,
forceNewMaster: false,
singleMasterTest: true,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -92,10 +101,27 @@ func TestCheckAndHeal(t *testing.T) {
allowSentinels: false,
},
{
name: "No masters, set random",
name: "No masters,No sentinel quorum set random",
nMasters: 0,
nRedis: 3,
forceNewMaster: true,
singleMasterTest: false,
forceNewMasterNoQrm: true,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
redisCheckNumberOK: true,
redisSetMasterOnAllOK: true,
sentinelSlavesNumberInMemoryOK: true,
allowSentinels: false,
},
{
name: "No masters,Sentinel Quorum but slave of local host set random",
nMasters: 0,
nRedis: 3,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: true,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -108,7 +134,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Slaves from master wrong",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: false,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -122,7 +150,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels not pointing correct monitor",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: false,
sentinelNumberInMemoryOK: true,
Expand All @@ -136,7 +166,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels with wrong number of sentinels",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: false,
Expand All @@ -150,7 +182,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels with wrong number of slaves",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand Down Expand Up @@ -251,6 +285,9 @@ func TestCheckAndHeal(t *testing.T) {
allowSentinels = test.allowSentinels
rf.Spec.BootstrapNode.AllowSentinels = allowSentinels
}
if test.singleMasterTest {
rf.Spec.Redis.Replicas = 1
}

expErr := false
continueTests := true
Expand Down Expand Up @@ -297,18 +334,26 @@ func TestCheckAndHeal(t *testing.T) {
mrfc.On("GetNumberMasters", rf).Once().Return(test.nMasters, nil)
switch test.nMasters {
case 0:
mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil)
if test.nRedis == 1 {
mrfh.On("MakeMaster", mock.Anything, rf).Once().Return(nil)
//mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil)
if rf.Spec.Redis.Replicas == 1 {
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
continueTests = false
break
}
if test.forceNewMaster {
mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Hour, nil)
mrfc.On("GetMaxRedisPodTime", rf).Once().Return(1*time.Hour, nil)
if test.forceNewMasterNoQrm {
mrfc.On("CheckSentinelQuorum", rf).Once().Return(1, errors.New(""))
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
} else if test.forceNewMasterFirstBoot {
mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil)
mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(true, nil)
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
} else {
mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Second, nil)
mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil)
mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(false, nil)
continueTests = false
}

case 1:
break
default:
Expand Down
Loading

0 comments on commit ea4f232

Please sign in to comment.