From 1e46182984c0dee22b95382acd7eb3c1c7cac6d8 Mon Sep 17 00:00:00 2001 From: Sergio Ballesteros Date: Thu, 1 Dec 2022 09:27:27 +0100 Subject: [PATCH] Add IsRedisRunning checks to ensure that pods are up and running --- .../service/RedisFailoverCheck.go | 42 +++++++++ operator/redisfailover/checker.go | 15 ++-- operator/redisfailover/checker_test.go | 6 +- operator/redisfailover/service/check.go | 29 +++++++ operator/redisfailover/service/check_test.go | 87 +++++++++++++++++++ 5 files changed, 168 insertions(+), 11 deletions(-) diff --git a/mocks/operator/redisfailover/service/RedisFailoverCheck.go b/mocks/operator/redisfailover/service/RedisFailoverCheck.go index 80bb06610..c21c37f1a 100644 --- a/mocks/operator/redisfailover/service/RedisFailoverCheck.go +++ b/mocks/operator/redisfailover/service/RedisFailoverCheck.go @@ -322,6 +322,48 @@ func (_m *RedisFailoverCheck) GetStatefulSetUpdateRevision(rFailover *v1.RedisFa return r0, r1 } +// IsClusterRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsClusterRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsRedisRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsRedisRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsSentinelRunning provides a mock function with given fields: rFailover +func (_m *RedisFailoverCheck) IsSentinelRunning(rFailover *v1.RedisFailover) bool { + ret := _m.Called(rFailover) + + var r0 bool + if rf, ok := ret.Get(0).(func(*v1.RedisFailover) bool); ok { + r0 = rf(rFailover) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + type 
mockConstructorTestingTNewRedisFailoverCheck interface { mock.TestingT Cleanup(func()) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 78690e4aa..0324b341b 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -198,14 +198,14 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error { - err := r.rfChecker.CheckRedisNumber(rf) - setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { + + if !r.rfChecker.IsRedisRunning(rf) { + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Debugf("Number of redis mismatch, waiting for redis statefulset reconcile") return nil } - err = r.UpdateRedisesPods(rf) + err := r.UpdateRedisesPods(rf) if err != nil { return err } @@ -221,10 +221,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red } if rf.SentinelsAllowed() { - err = r.rfChecker.CheckSentinelNumber(rf) - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) - if err != nil { - r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") + if !r.rfChecker.IsSentinelRunning(rf) { + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, errors.New("not all replicas running")) + r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", 
rf.ObjectMeta.Namespace).Debugf("Number of sentinel mismatch, waiting for sentinel deployment reconcile") return nil } diff --git a/operator/redisfailover/checker_test.go b/operator/redisfailover/checker_test.go index 02031e724..c8ae26970 100644 --- a/operator/redisfailover/checker_test.go +++ b/operator/redisfailover/checker_test.go @@ -265,14 +265,14 @@ func TestCheckAndHeal(t *testing.T) { mrfh := &mRFService.RedisFailoverHeal{} if test.redisCheckNumberOK { - mrfc.On("CheckRedisNumber", rf).Once().Return(nil) + mrfc.On("IsRedisRunning", rf).Once().Return(true) } else { continueTests = false - mrfc.On("CheckRedisNumber", rf).Once().Return(errors.New("")) + mrfc.On("IsRedisRunning", rf).Once().Return(false) } if allowSentinels { - mrfc.On("CheckSentinelNumber", rf).Once().Return(nil) + mrfc.On("IsSentinelRunning", rf).Once().Return(true) } if bootstrappingTests && continueTests { diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 63946a255..1a2a94914 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -34,6 +34,9 @@ type RedisFailoverCheck interface { GetStatefulSetUpdateRevision(rFailover *redisfailoverv1.RedisFailover) (string, error) GetRedisRevisionHash(podName string, rFailover *redisfailoverv1.RedisFailover) (string, error) CheckRedisSlavesReady(slaveIP string, rFailover *redisfailoverv1.RedisFailover) (bool, error) + IsRedisRunning(rFailover *redisfailoverv1.RedisFailover) bool + IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool + IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool } // RedisFailoverChecker is our implementation of RedisFailoverCheck interface @@ -385,6 +388,32 @@ func (r *RedisFailoverChecker) CheckRedisSlavesReady(ip string, rFailover *redis return r.redisClient.SlaveIsReady(ip, port, password) } +// IsRedisRunning returns true if all the pods are Running +func (r *RedisFailoverChecker) IsRedisRunning(rFailover 
*redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetStatefulSetPods(rFailover.Namespace, GetRedisName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Redis.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsSentinelRunning returns true if all the pods are Running
+func (r *RedisFailoverChecker) IsSentinelRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	dp, err := r.k8sService.GetDeploymentPods(rFailover.Namespace, GetSentinelName(rFailover))
+	return err == nil && len(dp.Items) > int(rFailover.Spec.Sentinel.Replicas-1) && AreAllRunning(dp)
+}
+
+// IsClusterRunning returns true if all the pods in the given redisfailover are Running
+func (r *RedisFailoverChecker) IsClusterRunning(rFailover *redisfailoverv1.RedisFailover) bool {
+	return r.IsSentinelRunning(rFailover) && r.IsRedisRunning(rFailover)
+}
+
 func getRedisPort(p int32) string {
 	return strconv.Itoa(int(p))
 }
+
+func AreAllRunning(pods *corev1.PodList) bool {
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning || pod.DeletionTimestamp != nil {
+			return false
+		}
+	}
+	return true
+}
diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go
index 79ceb84a7..cd81c901a 100644
--- a/operator/redisfailover/service/check_test.go
+++ b/operator/redisfailover/service/check_test.go
@@ -869,3 +869,90 @@ func TestGetRedisRevisionHash(t *testing.T) {
 	}
 
 }
+
+func TestClusterRunning(t *testing.T) {
+	assert := assert.New(t)
+
+	rf := generateRF()
+
+	allRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+			{
+				Status: corev1.PodStatus{
+					PodIP: "1.1.1.1",
+					Phase: corev1.PodRunning,
+				},
+			},
+		},
+	}
+
+	notAllRunning := &corev1.PodList{
+		Items: []corev1.Pod{
+			{
+				Status: corev1.PodStatus{
+					PodIP: "0.0.0.0",
+					Phase: 
corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodPending, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + }, + } + + notAllReplicas := &corev1.PodList{ + Items: []corev1.Pod{ + { + Status: corev1.PodStatus{ + PodIP: "0.0.0.0", + Phase: corev1.PodRunning, + }, + }, + { + Status: corev1.PodStatus{ + PodIP: "1.1.1.1", + Phase: corev1.PodRunning, + }, + }, + }, + } + + ms := &mK8SService.Services{} + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil) + mr := &mRedisService.Client{} + + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) + + assert.True(checker.IsClusterRunning(rf)) + + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(allRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(notAllReplicas, nil) + assert.False(checker.IsClusterRunning(rf)) + + ms.On("GetDeploymentPods", namespace, rfservice.GetSentinelName(rf)).Once().Return(notAllRunning, nil) + ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(allRunning, nil) + assert.False(checker.IsClusterRunning(rf)) + +}