Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to avoid operator conflict with the sentinel in choosing master #536

Merged
merged 6 commits into from
Dec 9, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
HEALTHY = 0.0
REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH"
SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH"
NO_MASTER = "NO_MASTER_AVAILABLE"
NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE"
SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP"
Expand Down Expand Up @@ -65,6 +66,7 @@ const (
MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER"
MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE"
GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE"
CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM"
SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY"
)

Expand Down
46 changes: 44 additions & 2 deletions mocks/operator/redisfailover/service/RedisFailoverCheck.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions mocks/service/redis/Client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

77 changes: 55 additions & 22 deletions operator/redisfailover/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ import (
"github.com/spotahome/redis-operator/metrics"
)

const (
timeToPrepare = 2 * time.Minute
)

// UpdateRedisesPods if the running version of pods are equal to the statefulset one
func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) error {
redises, err := r.rfChecker.GetRedisesIPs(rf)
Expand Down Expand Up @@ -118,34 +114,71 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
if err != nil {
return err
}

switch nMasters {
case 0:
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
redisesIP, err := r.rfChecker.GetRedisesIPs(rf)
if err != nil {
return err
}
if len(redisesIP) == 1 {
if err := r.rfHealer.MakeMaster(redisesIP[0], rf); err != nil {
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no masters detected"))
//when number of redis replicas is 1 , the redis is configured for standalone master mode
//Configure to master
if rf.Spec.Redis.Replicas == 1 {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("Resource spec with standalone master - operator will set the master")
err = r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err)
if err != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err
}
break
return nil
}
minTime, err2 := r.rfChecker.GetMinimumRedisPodTime(rf)
if err2 != nil {
return err2
//During the First boot(New deployment or all pods of the statefulsets have restarted),
//Sentinesl will not be able to choose the master , so operator should select a master
//Also in scenarios where Sentinels is not in a position to choose a master like , No quorum reached
//Operator can choose a master , These scenarios can be checked by asking the all the sentinels
//if its in a postion to choose a master also check if the redis is configured with local host IP as master.
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of Masters running is 0")
maxUptime, err := r.rfChecker.GetMaxRedisPodTime(rf)
if err != nil {
return err
}
if minTime > timeToPrepare {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("time %.f more than expected. Not even one master, fixing...", minTime.Round(time.Second).Seconds())
// We can consider there's an error
if err2 := r.rfHealer.SetOldestAsMaster(rf); err2 != nil {

r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("No master avaiable but max pod up time is : %f", maxUptime.Round(time.Second).Seconds())
//Check If Sentinel has quorum to take a failover decision
noqrm_cnt, err := r.rfChecker.CheckSentinelQuorum(rf)
if err != nil {
// Sentinels are not in a situation to choose a master we pick one
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Quorum not available for sentinel to choose master,estimated unhealthy sentinels :%d , Operator to step-in", noqrm_cnt)
err2 := r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err2)
if err2 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err2
}
} else {
// We'll wait until failover is done
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("No master found, wait until failover")
return nil
//sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master
status, err2 := r.rfChecker.CheckIfMasterLocalhost(rf)
if err2 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("CheckIfMasterLocalhost failed retry later")
return err2
} else if status {
// all avaialable redis pods have local host ip as master
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("all available redis is having local loop back as master , operator initiates master selection")
err3 := r.rfHealer.SetOldestAsMaster(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, err3)
if err3 != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Errorf("Error in Setting oldest Pod as master")
return err3
}

} else {

// We'll wait until failover is done
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Infof("no master found, wait until failover or fix manually")
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NO_MASTER, metrics.NOT_APPLICABLE, errors.New("no master not fixed, wait until failover or fix manually"))
return nil
}

}

case 1:
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil)
default:
Expand Down
81 changes: 63 additions & 18 deletions operator/redisfailover/checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
Expand All @@ -25,7 +24,9 @@ func TestCheckAndHeal(t *testing.T) {
name string
nMasters int
nRedis int
forceNewMaster bool
forceNewMasterNoQrm bool
forceNewMasterFirstBoot bool
singleMasterTest bool
slavesOK bool
sentinelMonitorOK bool
sentinelNumberInMemoryOK bool
Expand All @@ -39,7 +40,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Everything ok, no need to heal",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -53,7 +56,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Multiple masters",
nMasters: 2,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -67,7 +72,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "No masters but wait",
nMasters: 0,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -81,7 +88,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "No masters, only one redis available, make master",
nMasters: 0,
nRedis: 1,
forceNewMaster: false,
singleMasterTest: true,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -92,10 +101,27 @@ func TestCheckAndHeal(t *testing.T) {
allowSentinels: false,
},
{
name: "No masters, set random",
name: "No masters,No sentinel quorum set random",
nMasters: 0,
nRedis: 3,
forceNewMaster: true,
singleMasterTest: false,
forceNewMasterNoQrm: true,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
redisCheckNumberOK: true,
redisSetMasterOnAllOK: true,
sentinelSlavesNumberInMemoryOK: true,
allowSentinels: false,
},
{
name: "No masters,Sentinel Quorum but slave of local host set random",
nMasters: 0,
nRedis: 3,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: true,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -108,7 +134,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Slaves from master wrong",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: false,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand All @@ -122,7 +150,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels not pointing correct monitor",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: false,
sentinelNumberInMemoryOK: true,
Expand All @@ -136,7 +166,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels with wrong number of sentinels",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: false,
Expand All @@ -150,7 +182,9 @@ func TestCheckAndHeal(t *testing.T) {
name: "Sentinels with wrong number of slaves",
nMasters: 1,
nRedis: 3,
forceNewMaster: false,
singleMasterTest: false,
forceNewMasterNoQrm: false,
forceNewMasterFirstBoot: false,
slavesOK: true,
sentinelMonitorOK: true,
sentinelNumberInMemoryOK: true,
Expand Down Expand Up @@ -251,6 +285,9 @@ func TestCheckAndHeal(t *testing.T) {
allowSentinels = test.allowSentinels
rf.Spec.BootstrapNode.AllowSentinels = allowSentinels
}
if test.singleMasterTest {
rf.Spec.Redis.Replicas = 1
}

expErr := false
continueTests := true
Expand Down Expand Up @@ -297,18 +334,26 @@ func TestCheckAndHeal(t *testing.T) {
mrfc.On("GetNumberMasters", rf).Once().Return(test.nMasters, nil)
switch test.nMasters {
case 0:
mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil)
if test.nRedis == 1 {
mrfh.On("MakeMaster", mock.Anything, rf).Once().Return(nil)
//mrfc.On("GetRedisesIPs", rf).Once().Return(make([]string, test.nRedis), nil)
if rf.Spec.Redis.Replicas == 1 {
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
continueTests = false
break
}
if test.forceNewMaster {
mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Hour, nil)
mrfc.On("GetMaxRedisPodTime", rf).Once().Return(1*time.Hour, nil)
if test.forceNewMasterNoQrm {
mrfc.On("CheckSentinelQuorum", rf).Once().Return(1, errors.New(""))
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
} else if test.forceNewMasterFirstBoot {
mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil)
mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(true, nil)
mrfh.On("SetOldestAsMaster", rf).Once().Return(nil)
} else {
mrfc.On("GetMinimumRedisPodTime", rf).Once().Return(1*time.Second, nil)
mrfc.On("CheckSentinelQuorum", rf).Once().Return(3, nil)
mrfc.On("CheckIfMasterLocalhost", rf).Once().Return(false, nil)
continueTests = false
}

case 1:
break
default:
Expand Down
Loading