From af671ed71dfe57e79c34160cd23d2768d2ff329a Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Mon, 26 Sep 2022 11:22:40 +0530 Subject: [PATCH 01/24] Add ensure success/failure metrics --- .gitignore | 3 +- api/redisfailover/v1/validate.go | 1 + docker/development/Dockerfile | 2 +- metrics/dummy.go | 4 ++ metrics/metrics.go | 53 +++++++++++++++++-- operator/redisfailover/ensurer.go | 3 +- operator/redisfailover/ensurer_test.go | 2 +- operator/redisfailover/factory.go | 2 +- operator/redisfailover/handler.go | 2 +- operator/redisfailover/service/client.go | 51 +++++++++++++----- .../redisfailover/service/generator_test.go | 31 +++++------ 11 files changed, 116 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 15eb3155e..7a1189ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .bash_history .vscode .idea/ -/tmp \ No newline at end of file +/tmp +vendor \ No newline at end of file diff --git a/api/redisfailover/v1/validate.go b/api/redisfailover/v1/validate.go index ec5a7e66e..f3efedbff 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -12,6 +12,7 @@ const ( // Validate set the values by default if not defined and checks if the values given are valid func (r *RedisFailover) Validate() error { + if len(r.Name) > maxNameLength { return fmt.Errorf("name length can't be higher than %d", maxNameLength) } diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile index 0e247dbcb..5a0959f62 100644 --- a/docker/development/Dockerfile +++ b/docker/development/Dockerfile @@ -18,7 +18,7 @@ RUN wget http://github.com/kubernetes/code-generator/archive/kubernetes-${CODEGE # Mock creator ARG MOCKERY_VERSION="2.9.4" -RUN wget -c https://github.com/vektra/mockery/releases/download/v${MOCKERY_VERSION}/mockery_${MOCKERY_VERSION}_$(uname -o)_$(uname -m).tar.gz -O - | tar -xz -C /go/bin/ +RUN wget -c "https://github.com/vektra/mockery/releases/download/v2.9.4/mockery_2.9.4_Linux_arm64.tar.gz" -O - 
| tar -xz -C /go/bin/ # Create user ARG uid=1000 diff --git a/metrics/dummy.go b/metrics/dummy.go index d18d4c80d..a63186a2d 100644 --- a/metrics/dummy.go +++ b/metrics/dummy.go @@ -17,3 +17,7 @@ type dummy struct { func (d *dummy) SetClusterOK(namespace string, name string) {} func (d *dummy) SetClusterError(namespace string, name string) {} func (d *dummy) DeleteCluster(namespace string, name string) {} +func (d *dummy) IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) { +} +func (d *dummy) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { +} diff --git a/metrics/metrics.go b/metrics/metrics.go index e4e3e9db0..20d7cf75a 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -10,20 +10,43 @@ const ( promControllerSubsystem = "controller" ) +const () + // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics. type Recorder interface { koopercontroller.MetricsRecorder + // ClusterOK metrics SetClusterOK(namespace string, name string) SetClusterError(namespace string, name string) DeleteCluster(namespace string, name string) + + //K8s Manager resources + IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) + IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) } // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. type recorder struct { // Metrics fields. - clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster + clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster + ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. 
+ ensureResourceFailureRecorder *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. + koopercontroller.MetricsRecorder +} +type ensureResourceSuccessRecorder struct { + ensureResourceSuccess *prometheus.CounterVec + koopercontroller.MetricsRecorder +} + +type ensureResourceFailureRecorder struct { + ensureResourceSuccess *prometheus.CounterVec + koopercontroller.MetricsRecorder +} + +type k8sClientErrorRecorder struct { + k8sClientErrors *prometheus.GaugeVec koopercontroller.MetricsRecorder } @@ -37,10 +60,25 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { Help: "Number of failover clusters managed by the operator.", }, []string{"namespace", "name"}) + ensureResourceSuccess := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "ensure_resource_success", + Help: "number of successful 'ensure' operations on a resource performed by the controller.", + }, []string{"object_namespace", "object_name", "object_kind", "resource_name"}) + + ensureResourceFailure := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "ensure_resource_failure", + Help: "number of failed 'ensure' operations on a resource performed by the controller.", + }, []string{"object_namespace", "object_name", "object_kind", "resource_name"}) + // Create the instance. r := recorder{ - clusterOK: clusterOK, - + clusterOK: clusterOK, + ensureResourceSuccess: ensureResourceSuccess, + ensureResourceFailureRecorder: ensureResourceFailure, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -49,6 +87,7 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Register metrics. 
reg.MustRegister( r.clusterOK, + r.ensureResourceSuccess, ) return r @@ -68,3 +107,11 @@ func (r recorder) SetClusterError(namespace string, name string) { func (r recorder) DeleteCluster(namespace string, name string) { r.clusterOK.DeleteLabelValues(namespace, name) } + +func (r recorder) IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) { + r.ensureResourceSuccess.WithLabelValues(objectNamespace, objectName, objectKind, resourceName).Add(1) +} + +func (r recorder) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { + r.ensureResourceSuccess.WithLabelValues(objectNamespace, objectName, objectKind, resourceName).Add(1) +} diff --git a/operator/redisfailover/ensurer.go b/operator/redisfailover/ensurer.go index 2c20fa1db..23b609417 100644 --- a/operator/redisfailover/ensurer.go +++ b/operator/redisfailover/ensurer.go @@ -4,10 +4,11 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" + "github.com/spotahome/redis-operator/metrics" ) // Ensure is called to ensure all of the resources associated with a RedisFailover are created -func (w *RedisFailoverHandler) Ensure(rf *redisfailoverv1.RedisFailover, labels map[string]string, or []metav1.OwnerReference) error { +func (w *RedisFailoverHandler) Ensure(rf *redisfailoverv1.RedisFailover, labels map[string]string, or []metav1.OwnerReference, metricsClient metrics.Recorder) error { if rf.Spec.Redis.Exporter.Enabled { if err := w.rfService.EnsureRedisService(rf, labels, or); err != nil { return err diff --git a/operator/redisfailover/ensurer_test.go b/operator/redisfailover/ensurer_test.go index efe033687..254e36383 100644 --- a/operator/redisfailover/ensurer_test.go +++ b/operator/redisfailover/ensurer_test.go @@ -124,7 +124,7 @@ func TestEnsure(t *testing.T) { // Create the Kops client and call the valid logic. 
handler := rfOperator.NewRedisFailoverHandler(config, mrfs, mrfc, mrfh, mk, metrics.Dummy, log.Dummy) - err := handler.Ensure(rf, map[string]string{}, []metav1.OwnerReference{}) + err := handler.Ensure(rf, map[string]string{}, []metav1.OwnerReference{}, metrics.Dummy) assert.NoError(err) mrfs.AssertExpectations(t) diff --git a/operator/redisfailover/factory.go b/operator/redisfailover/factory.go index ac92a9f2b..d8a44df25 100644 --- a/operator/redisfailover/factory.go +++ b/operator/redisfailover/factory.go @@ -30,7 +30,7 @@ const ( // to create redis failovers. func New(cfg Config, k8sService k8s.Services, k8sClient kubernetes.Interface, lockNamespace string, redisClient redis.Client, kooperMetricsRecorder metrics.Recorder, logger log.Logger) (controller.Controller, error) { // Create internal services. - rfService := rfservice.NewRedisFailoverKubeClient(k8sService, logger) + rfService := rfservice.NewRedisFailoverKubeClient(k8sService, logger, kooperMetricsRecorder) rfChecker := rfservice.NewRedisFailoverChecker(k8sService, redisClient, logger) rfHealer := rfservice.NewRedisFailoverHealer(k8sService, redisClient, logger) diff --git a/operator/redisfailover/handler.go b/operator/redisfailover/handler.go index e760ffdaa..6a03afd56 100644 --- a/operator/redisfailover/handler.go +++ b/operator/redisfailover/handler.go @@ -71,7 +71,7 @@ func (r *RedisFailoverHandler) Handle(_ context.Context, obj runtime.Object) err // Create the labels every object derived from this need to have. 
labels := r.getLabels(rf) - if err := r.Ensure(rf, labels, oRefs); err != nil { + if err := r.Ensure(rf, labels, oRefs, r.mClient); err != nil { r.mClient.SetClusterError(rf.Namespace, rf.Name) return err } diff --git a/operator/redisfailover/service/client.go b/operator/redisfailover/service/client.go index ed6567bda..f92dd728c 100644 --- a/operator/redisfailover/service/client.go +++ b/operator/redisfailover/service/client.go @@ -6,6 +6,7 @@ import ( redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/operator/redisfailover/util" "github.com/spotahome/redis-operator/service/k8s" ) @@ -26,15 +27,17 @@ type RedisFailoverClient interface { // RedisFailoverKubeClient implements the required methods to talk with kubernetes type RedisFailoverKubeClient struct { - K8SService k8s.Services - logger log.Logger + K8SService k8s.Services + logger log.Logger + metricsClient metrics.Recorder } // NewRedisFailoverKubeClient creates a new RedisFailoverKubeClient -func NewRedisFailoverKubeClient(k8sService k8s.Services, logger log.Logger) *RedisFailoverKubeClient { +func NewRedisFailoverKubeClient(k8sService k8s.Services, logger log.Logger, metricsClient metrics.Recorder) *RedisFailoverKubeClient { return &RedisFailoverKubeClient{ - K8SService: k8sService, - logger: logger, + K8SService: k8sService, + logger: logger, + metricsClient: metricsClient, } } @@ -65,13 +68,15 @@ func generateRedisSlaveRoleLabel() map[string]string { // EnsureSentinelService makes sure the sentinel service exists func (r *RedisFailoverKubeClient) EnsureSentinelService(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { svc := generateSentinelService(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateService(rf.Namespace, svc) + err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) + return 
r.setK8sObjectCreationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) } // EnsureSentinelConfigMap makes sure the sentinel configmap exists func (r *RedisFailoverKubeClient) EnsureSentinelConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := generateSentinelConfigMap(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // EnsureSentinelDeployment makes sure the sentinel deployment exists in the desired state @@ -80,7 +85,9 @@ func (r *RedisFailoverKubeClient) EnsureSentinelDeployment(rf *redisfailoverv1.R return err } d := generateSentinelDeployment(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateDeployment(rf.Namespace, d) + err := r.K8SService.CreateOrUpdateDeployment(rf.Namespace, d) + + return r.setK8sObjectCreationMetrics(d.Namespace, d.Name, "Deployment", rf.Name, err) } // EnsureRedisStatefulset makes sure the redis statefulset exists in the desired state @@ -89,7 +96,9 @@ func (r *RedisFailoverKubeClient) EnsureRedisStatefulset(rf *redisfailoverv1.Red return err } ss := generateRedisStatefulSet(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateStatefulSet(rf.Namespace, ss) + err := r.K8SService.CreateOrUpdateStatefulSet(rf.Namespace, ss) + + return r.setK8sObjectCreationMetrics(ss.Namespace, ss.Name, "StatefulSet", rf.Name, err) } // EnsureRedisConfigMap makes sure the Redis ConfigMap exists @@ -101,7 +110,9 @@ func (r *RedisFailoverKubeClient) EnsureRedisConfigMap(rf *redisfailoverv1.Redis } cm := generateRedisConfigMap(rf, labels, ownerRefs, password) - return r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + err = r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + + return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // 
EnsureRedisShutdownConfigMap makes sure the redis configmap with shutdown script exists @@ -112,7 +123,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover } } else { cm := generateRedisShutdownConfigMap(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } return nil } @@ -120,13 +132,16 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover // EnsureRedisReadinessConfigMap makes sure the redis configmap with shutdown script exists func (r *RedisFailoverKubeClient) EnsureRedisReadinessConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := generateRedisReadinessConfigMap(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) + return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // EnsureRedisService makes sure the redis statefulset exists func (r *RedisFailoverKubeClient) EnsureRedisService(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { svc := generateRedisService(rf, labels, ownerRefs) - return r.K8SService.CreateOrUpdateService(rf.Namespace, svc) + err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) + + return r.setK8sObjectCreationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) } // EnsureNotPresentRedisService makes sure the redis service is not present @@ -153,6 +168,14 @@ func (r *RedisFailoverKubeClient) ensurePodDisruptionBudget(rf *redisfailoverv1. 
labels = util.MergeLabels(labels, generateSelectorLabels(component, rf.Name)) pdb := generatePodDisruptionBudget(name, namespace, labels, ownerRefs, minAvailable) + err := r.K8SService.CreateOrUpdatePodDisruptionBudget(namespace, pdb) + return r.setK8sObjectCreationMetrics(pdb.Namespace, pdb.Name, "PodDisruptionBudget" /* pdb.TypeMeta.Kind isnt working; pdb.Kind isnt working either */, rf.Name, err) +} - return r.K8SService.CreateOrUpdatePodDisruptionBudget(namespace, pdb) +func (r *RedisFailoverKubeClient) setK8sObjectCreationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) error { + if nil != err { + r.metricsClient.IncrEnsureResourceFailureCount(objectNamespace, objectName, objectKind, ownerName) + } + r.metricsClient.IncrEnsureResourceSuccessCount(objectNamespace, objectName, objectKind, ownerName) + return err } diff --git a/operator/redisfailover/service/generator_test.go b/operator/redisfailover/service/generator_test.go index 332e6067f..0167cc53a 100644 --- a/operator/redisfailover/service/generator_test.go +++ b/operator/redisfailover/service/generator_test.go @@ -13,6 +13,7 @@ import ( redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" mK8SService "github.com/spotahome/redis-operator/mocks/service/k8s" rfservice "github.com/spotahome/redis-operator/operator/redisfailover/service" ) @@ -531,7 +532,7 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { generatedStatefulSet = *ss }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, test.ownerRefs) // Check that the storage-related fields are as spected @@ -585,7 +586,7 @@ func TestRedisStatefulSetCommands(t *testing.T) { gotCommands = ss.Spec.Template.Spec.Containers[0].Command 
}).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedCommands, gotCommands) @@ -637,7 +638,7 @@ func TestSentinelDeploymentCommands(t *testing.T) { gotCommands = d.Spec.Template.Spec.Containers[0].Command }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedCommands, gotCommands) @@ -685,7 +686,7 @@ func TestRedisStatefulSetPodAnnotations(t *testing.T) { gotPodAnnotations = ss.Spec.Template.ObjectMeta.Annotations }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedPodAnnotations, gotPodAnnotations) @@ -733,7 +734,7 @@ func TestSentinelDeploymentPodAnnotations(t *testing.T) { gotPodAnnotations = d.Spec.Template.ObjectMeta.Annotations }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedPodAnnotations, gotPodAnnotations) @@ -775,7 +776,7 @@ func TestRedisStatefulSetServiceAccountName(t *testing.T) { gotServiceAccountName = ss.Spec.Template.Spec.ServiceAccountName }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedServiceAccountName, gotServiceAccountName) @@ 
-817,7 +818,7 @@ func TestSentinelDeploymentServiceAccountName(t *testing.T) { gotServiceAccountName = d.Spec.Template.Spec.ServiceAccountName }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.Equal(test.expectedServiceAccountName, gotServiceAccountName) @@ -1034,7 +1035,7 @@ func TestSentinelService(t *testing.T) { generatedService = *s }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelService(rf, test.rfLabels, []metav1.OwnerReference{{Name: "testing"}}) assert.Equal(test.expectedService, generatedService) @@ -1282,7 +1283,7 @@ func TestRedisService(t *testing.T) { generatedService = *s }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisService(rf, test.rfLabels, []metav1.OwnerReference{{Name: "testing"}}) assert.Equal(test.expectedService, generatedService) @@ -1331,7 +1332,7 @@ func TestRedisHostNetworkAndDnsPolicy(t *testing.T) { actualDnsPolicy = ss.Spec.Template.Spec.DNSPolicy }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) @@ -1380,7 +1381,7 @@ func TestSentinelHostNetworkAndDnsPolicy(t *testing.T) { actualDnsPolicy = d.Spec.Template.Spec.DNSPolicy }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) @@ -1430,7 
+1431,7 @@ func TestRedisImagePullPolicy(t *testing.T) { exporterPolicy = ss.Spec.Template.Spec.Containers[1].ImagePullPolicy }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) @@ -1476,7 +1477,7 @@ func TestSentinelImagePullPolicy(t *testing.T) { configPolicy = d.Spec.Template.Spec.InitContainers[0].ImagePullPolicy }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) @@ -1552,7 +1553,7 @@ func TestRedisExtraVolumeMounts(t *testing.T) { extraVolumeMount = s.Spec.Template.Spec.Containers[0].VolumeMounts[4] }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) @@ -1628,7 +1629,7 @@ func TestSentinelExtraVolumeMounts(t *testing.T) { extraVolumeMount = d.Spec.Template.Spec.Containers[0].VolumeMounts[1] }).Return(nil) - client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy) + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) err := client.EnsureSentinelDeployment(rf, nil, []metav1.OwnerReference{}) assert.NoError(err) From bc997a986362a625a049966557620a9b338790fc Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Mon, 26 Sep 2022 11:27:29 +0530 Subject: [PATCH 02/24] add ensureResourceFailure to MustRegister --- metrics/metrics.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 20d7cf75a..b059afb6b 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -29,9 +29,9 @@ 
type Recorder interface { // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. type recorder struct { // Metrics fields. - clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster - ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. - ensureResourceFailureRecorder *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. + clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster + ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. + ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. koopercontroller.MetricsRecorder } @@ -76,9 +76,9 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Create the instance. r := recorder{ - clusterOK: clusterOK, - ensureResourceSuccess: ensureResourceSuccess, - ensureResourceFailureRecorder: ensureResourceFailure, + clusterOK: clusterOK, + ensureResourceSuccess: ensureResourceSuccess, + ensureResourceFailure: ensureResourceFailure, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -88,6 +88,7 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { reg.MustRegister( r.clusterOK, r.ensureResourceSuccess, + r.ensureResourceFailure, ) return r From 9eea5183628d8fd77a1fdfd8e152601e0f93401e Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Mon, 26 Sep 2022 13:37:30 +0530 Subject: [PATCH 03/24] add redisInstance metric - reports IP of the instance, and IP of its master. 
--- Makefile | 1 - cmd/redisoperator/main.go | 2 +- metrics/dummy.go | 1 + metrics/metrics.go | 23 +++++++++++++++++--- operator/redisfailover/factory.go | 2 +- operator/redisfailover/service/check.go | 18 +++++++++------ operator/redisfailover/service/check_test.go | 3 ++- operator/redisfailover/service/client.go | 20 ++++++++--------- service/redis/client.go | 10 +++++++-- 9 files changed, 54 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 21ca105b7..c91bc8c4b 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,6 @@ image: deps-development -t $(SERVICE_NAME) \ -t $(REPOSITORY):latest \ -t $(REPOSITORY):$(COMMIT) \ - -t $(REPOSITORY):$(BRANCH) \ -f $(APP_DIR)/Dockerfile \ . diff --git a/cmd/redisoperator/main.go b/cmd/redisoperator/main.go index ea18cc019..c9b4a535f 100644 --- a/cmd/redisoperator/main.go +++ b/cmd/redisoperator/main.go @@ -85,7 +85,7 @@ func (m *Main) Run() error { k8sservice := k8s.New(k8sClient, customClient, aeClientset, m.logger) // Create the redis clients - redisClient := redis.New() + redisClient := redis.New(metricsRecorder) // Get lease lock resource namespace lockNamespace := getNamespace() diff --git a/metrics/dummy.go b/metrics/dummy.go index a63186a2d..1c39d8749 100644 --- a/metrics/dummy.go +++ b/metrics/dummy.go @@ -21,3 +21,4 @@ func (d *dummy) IncrEnsureResourceSuccessCount(objectNamespace string, objectNam } func (d *dummy) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { } +func (d *dummy) SetRedisInstance(IP string, masterIP string, role string) {} diff --git a/metrics/metrics.go b/metrics/metrics.go index b059afb6b..a464dd0da 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -21,9 +21,12 @@ type Recorder interface { SetClusterError(namespace string, name string) DeleteCluster(namespace string, name string) - //K8s Manager resources + // Indicate if `ensure` operation succeeded IncrEnsureResourceSuccessCount(objectNamespace string, 
objectName string, objectKind string, resourceName string) + // Indicate if `ensure` operation failed IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) + // Indicate redis instances being monitored + SetRedisInstance(IP string, masterIP string, role string) } // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. @@ -32,6 +35,7 @@ type recorder struct { clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. + redisInstance *prometheus.GaugeVec // Indicates known redis instances, with IPs and master/slave status koopercontroller.MetricsRecorder } @@ -45,8 +49,8 @@ type ensureResourceFailureRecorder struct { koopercontroller.MetricsRecorder } -type k8sClientErrorRecorder struct { - k8sClientErrors *prometheus.GaugeVec +type redisInstanceRecorder struct { + ensureResourceSuccess *prometheus.GaugeVec koopercontroller.MetricsRecorder } @@ -74,11 +78,19 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { Help: "number of failed 'ensure' operations on a resource performed by the controller.", }, []string{"object_namespace", "object_name", "object_kind", "resource_name"}) + redisInstance := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "redis_instance", + Help: "redis instances discovered guage. IPs of redis instances, and Master/Slave role as indicators in the labels.", + }, []string{"IP", "MasterIP", "role"}) + // Create the instance. 
r := recorder{ clusterOK: clusterOK, ensureResourceSuccess: ensureResourceSuccess, ensureResourceFailure: ensureResourceFailure, + redisInstance: redisInstance, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -89,6 +101,7 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { r.clusterOK, r.ensureResourceSuccess, r.ensureResourceFailure, + r.redisInstance, ) return r @@ -116,3 +129,7 @@ func (r recorder) IncrEnsureResourceSuccessCount(objectNamespace string, objectN func (r recorder) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { r.ensureResourceSuccess.WithLabelValues(objectNamespace, objectName, objectKind, resourceName).Add(1) } + +func (r recorder) SetRedisInstance(IP string, masterIP string, role string) { + r.redisInstance.WithLabelValues(IP, masterIP, role).Set(1) +} diff --git a/operator/redisfailover/factory.go b/operator/redisfailover/factory.go index d8a44df25..2b4c82c8b 100644 --- a/operator/redisfailover/factory.go +++ b/operator/redisfailover/factory.go @@ -31,7 +31,7 @@ const ( func New(cfg Config, k8sService k8s.Services, k8sClient kubernetes.Interface, lockNamespace string, redisClient redis.Client, kooperMetricsRecorder metrics.Recorder, logger log.Logger) (controller.Controller, error) { // Create internal services. rfService := rfservice.NewRedisFailoverKubeClient(k8sService, logger, kooperMetricsRecorder) - rfChecker := rfservice.NewRedisFailoverChecker(k8sService, redisClient, logger) + rfChecker := rfservice.NewRedisFailoverChecker(k8sService, redisClient, logger, kooperMetricsRecorder) rfHealer := rfservice.NewRedisFailoverHealer(k8sService, redisClient, logger) // Create the handlers. 
diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index ddce86ae8..396bc72a7 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -11,6 +11,7 @@ import ( redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" "github.com/spotahome/redis-operator/service/redis" ) @@ -37,17 +38,19 @@ type RedisFailoverCheck interface { // RedisFailoverChecker is our implementation of RedisFailoverCheck interface type RedisFailoverChecker struct { - k8sService k8s.Services - redisClient redis.Client - logger log.Logger + k8sService k8s.Services + redisClient redis.Client + logger log.Logger + metricsClient metrics.Recorder } // NewRedisFailoverChecker creates an object of the RedisFailoverChecker struct -func NewRedisFailoverChecker(k8sService k8s.Services, redisClient redis.Client, logger log.Logger) *RedisFailoverChecker { +func NewRedisFailoverChecker(k8sService k8s.Services, redisClient redis.Client, logger log.Logger, metricsClient metrics.Recorder) *RedisFailoverChecker { return &RedisFailoverChecker{ - k8sService: k8sService, - redisClient: redisClient, - logger: logger, + k8sService: k8sService, + redisClient: redisClient, + logger: logger, + metricsClient: metricsClient, } } @@ -120,6 +123,7 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis } slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password) + // set redis instance metrics if err != nil { r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP) return err diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go index 297777d30..bb9595eb8 100644 --- a/operator/redisfailover/service/check_test.go +++ 
b/operator/redisfailover/service/check_test.go @@ -14,6 +14,7 @@ import ( redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" mK8SService "github.com/spotahome/redis-operator/mocks/service/k8s" mRedisService "github.com/spotahome/redis-operator/mocks/service/redis" rfservice "github.com/spotahome/redis-operator/operator/redisfailover/service" @@ -66,7 +67,7 @@ func TestCheckRedisNumberFalse(t *testing.T) { ms.On("GetStatefulSet", namespace, rfservice.GetRedisName(rf)).Once().Return(ss, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckRedisNumber(rf) assert.Error(err) diff --git a/operator/redisfailover/service/client.go b/operator/redisfailover/service/client.go index f92dd728c..b031b0c92 100644 --- a/operator/redisfailover/service/client.go +++ b/operator/redisfailover/service/client.go @@ -69,14 +69,14 @@ func generateRedisSlaveRoleLabel() map[string]string { func (r *RedisFailoverKubeClient) EnsureSentinelService(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { svc := generateSentinelService(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) - return r.setK8sObjectCreationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + return r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) } // EnsureSentinelConfigMap makes sure the sentinel configmap exists func (r *RedisFailoverKubeClient) EnsureSentinelConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := generateSentinelConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return 
r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // EnsureSentinelDeployment makes sure the sentinel deployment exists in the desired state @@ -87,7 +87,7 @@ func (r *RedisFailoverKubeClient) EnsureSentinelDeployment(rf *redisfailoverv1.R d := generateSentinelDeployment(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateDeployment(rf.Namespace, d) - return r.setK8sObjectCreationMetrics(d.Namespace, d.Name, "Deployment", rf.Name, err) + return r.setEnsureOperationMetrics(d.Namespace, d.Name, "Deployment", rf.Name, err) } // EnsureRedisStatefulset makes sure the redis statefulset exists in the desired state @@ -98,7 +98,7 @@ func (r *RedisFailoverKubeClient) EnsureRedisStatefulset(rf *redisfailoverv1.Red ss := generateRedisStatefulSet(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateStatefulSet(rf.Namespace, ss) - return r.setK8sObjectCreationMetrics(ss.Namespace, ss.Name, "StatefulSet", rf.Name, err) + return r.setEnsureOperationMetrics(ss.Namespace, ss.Name, "StatefulSet", rf.Name, err) } // EnsureRedisConfigMap makes sure the Redis ConfigMap exists @@ -112,7 +112,7 @@ func (r *RedisFailoverKubeClient) EnsureRedisConfigMap(rf *redisfailoverv1.Redis cm := generateRedisConfigMap(rf, labels, ownerRefs, password) err = r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // EnsureRedisShutdownConfigMap makes sure the redis configmap with shutdown script exists @@ -124,7 +124,7 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover } else { cm := generateRedisShutdownConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", 
rf.Name, err) + return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } return nil } @@ -133,7 +133,7 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover func (r *RedisFailoverKubeClient) EnsureRedisReadinessConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := generateRedisReadinessConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setK8sObjectCreationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) } // EnsureRedisService makes sure the redis statefulset exists @@ -141,7 +141,7 @@ func (r *RedisFailoverKubeClient) EnsureRedisService(rf *redisfailoverv1.RedisFa svc := generateRedisService(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) - return r.setK8sObjectCreationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + return r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) } // EnsureNotPresentRedisService makes sure the redis service is not present @@ -169,10 +169,10 @@ func (r *RedisFailoverKubeClient) ensurePodDisruptionBudget(rf *redisfailoverv1. 
pdb := generatePodDisruptionBudget(name, namespace, labels, ownerRefs, minAvailable) err := r.K8SService.CreateOrUpdatePodDisruptionBudget(namespace, pdb) - return r.setK8sObjectCreationMetrics(pdb.Namespace, pdb.Name, "PodDisruptionBudget" /* pdb.TypeMeta.Kind isnt working; pdb.Kind isnt working either */, rf.Name, err) + return r.setEnsureOperationMetrics(pdb.Namespace, pdb.Name, "PodDisruptionBudget" /* pdb.TypeMeta.Kind isnt working; pdb.Kind isnt working either */, rf.Name, err) } -func (r *RedisFailoverKubeClient) setK8sObjectCreationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) error { +func (r *RedisFailoverKubeClient) setEnsureOperationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) error { if nil != err { r.metricsClient.IncrEnsureResourceFailureCount(objectNamespace, objectName, objectKind, ownerName) } diff --git a/service/redis/client.go b/service/redis/client.go index da7e9c84e..879e1157d 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -10,6 +10,7 @@ import ( "strings" rediscli "github.com/go-redis/redis/v8" + "github.com/spotahome/redis-operator/metrics" ) // Client defines the functions neccesary to connect to redis and sentinel to get or set what we nned @@ -31,11 +32,14 @@ type Client interface { } type client struct { + metricsRecorder metrics.Recorder } // New returns a redis client -func New() Client { - return &client{} +func New(metricsRecorder metrics.Recorder) Client { + return &client{ + metricsRecorder: metricsRecorder, + } } const ( @@ -157,8 +161,10 @@ func (c *client) GetSlaveOf(ip, port, password string) (string, error) { } match := redisMasterHostRE.FindStringSubmatch(info) if len(match) == 0 { + c.metricsRecorder.SetRedisInstance(ip, ip, "master") return "", nil } + c.metricsRecorder.SetRedisInstance(ip, match[1], "slave") return match[1], nil } From 94ac740ea945c5319aec2e74bf3d5a09cf11d7e0 Mon Sep 17 
00:00:00 2001 From: Raghu Nandan B S Date: Mon, 26 Sep 2022 17:43:11 +0530 Subject: [PATCH 04/24] add redisUnhealthy/sentinelUnhealthy metrics --- metrics/dummy.go | 5 ++ metrics/metrics.go | 70 +++++++++++++++++++++++-- operator/redisfailover/checker.go | 11 +++- operator/redisfailover/service/check.go | 3 ++ service/redis/client.go | 36 +++++++++++++ 5 files changed, 121 insertions(+), 4 deletions(-) diff --git a/metrics/dummy.go b/metrics/dummy.go index 1c39d8749..431ef7481 100644 --- a/metrics/dummy.go +++ b/metrics/dummy.go @@ -22,3 +22,8 @@ func (d *dummy) IncrEnsureResourceSuccessCount(objectNamespace string, objectNam func (d *dummy) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { } func (d *dummy) SetRedisInstance(IP string, masterIP string, role string) {} +func (d *dummy) ResetRedisInstance() {} +func (d *dummy) IncrRedisUnhealthyCount(namespace string, resource string, indicator string, instance string) { +} +func (d *dummy) IncrSentinelUnhealthyCount(namespace string, resource string, indicator string, instance string) { +} diff --git a/metrics/metrics.go b/metrics/metrics.go index a464dd0da..4514540ff 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -10,7 +10,24 @@ const ( promControllerSubsystem = "controller" ) -const () +// variables for setting various indicator labels +const ( + NOT_APPLICABLE = "NA" + UNHEALTHY = 1.0 + HEALTHY = 0.0 + REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" + SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" + NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" + SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" + SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" + + // redis connection related errors + WRONG_PASSWORD_USED = "WRONG_PASSWORD_USED" + NOAUTH = "AUTH_CREDENTIALS_NOT_PROVIDED" + NOPERM = "REDIS_USER_DOES_NOT_HAVE_PERMISSIONS" + IO_TIMEOUT = "CONNECTION_TIMEDOUT" + 
CONNECTION_REFUSED = "CONNECTION_REFUSED" +) // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics. type Recorder interface { @@ -25,8 +42,13 @@ type Recorder interface { IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) // Indicate if `ensure` operation failed IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) + // Indicate redis instances being monitored SetRedisInstance(IP string, masterIP string, role string) + ResetRedisInstance() + + IncrRedisUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) + IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) } // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. @@ -36,6 +58,8 @@ type recorder struct { ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. redisInstance *prometheus.GaugeVec // Indicates known redis instances, with IPs and master/slave status + redisUnhealthy *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) + sentinelUnhealthy *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) koopercontroller.MetricsRecorder } @@ -54,6 +78,16 @@ type redisInstanceRecorder struct { koopercontroller.MetricsRecorder } +type redisHealthRecorder struct { + redisUnhealthy *prometheus.CounterVec + koopercontroller.MetricsRecorder +} + +type sentinelHealthRecorder struct { + sentinelUnhealthy *prometheus.CounterVec + koopercontroller.MetricsRecorder +} + // NewPrometheusMetrics returns a new PromMetrics object. 
func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Create metrics. @@ -81,16 +115,32 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { redisInstance := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "redis_instance", - Help: "redis instances discovered guage. IPs of redis instances, and Master/Slave role as indicators in the labels.", + Name: "redis_instance_info", + Help: "redis instances discovered. IPs of redis instances, and Master/Slave role as indicators in the labels.", }, []string{"IP", "MasterIP", "role"}) + redisUnhealthy := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "redis_unhealthy", + Help: "indicates any error encountered in managed redis instance(s)", + }, []string{"namespace", "resource", "indicator", "instance"}) + + sentinelUnhealthy := prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "sentinel_unhealthy", + Help: "indicates any error encountered in managed sentinel instance(s)", + }, []string{"namespace", "resource", "indicator", "instance"}) + // Create the instance. 
r := recorder{ clusterOK: clusterOK, ensureResourceSuccess: ensureResourceSuccess, ensureResourceFailure: ensureResourceFailure, redisInstance: redisInstance, + redisUnhealthy: redisUnhealthy, + sentinelUnhealthy: sentinelUnhealthy, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -102,6 +152,8 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { r.ensureResourceSuccess, r.ensureResourceFailure, r.redisInstance, + r.redisUnhealthy, + r.sentinelUnhealthy, ) return r @@ -133,3 +185,15 @@ func (r recorder) IncrEnsureResourceFailureCount(objectNamespace string, objectN func (r recorder) SetRedisInstance(IP string, masterIP string, role string) { r.redisInstance.WithLabelValues(IP, masterIP, role).Set(1) } + +func (r recorder) ResetRedisInstance() { + r.redisInstance.Reset() +} + +func (r recorder) IncrRedisUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) { + r.redisUnhealthy.WithLabelValues(namespace, resource, indicator, instance).Add(1) +} + +func (r recorder) IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string) { + r.sentinelUnhealthy.WithLabelValues(namespace, resource, indicator, instance).Add(1) +} diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 583d74628..d6c4f0148 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -6,13 +6,14 @@ import ( "time" redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" + "github.com/spotahome/redis-operator/metrics" ) const ( timeToPrepare = 2 * time.Minute ) -//UpdateRedisesPods if the running version of pods are equal to the statefulset one +// UpdateRedisesPods if the running version of pods are equal to the statefulset one func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailover) 
error { redises, err := r.rfChecker.GetRedisesIPs(rf) if err != nil { @@ -100,12 +101,16 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // All sentinels points to the same redis master // Sentinel has not death nodes // Sentinel knows the correct slave number + if err := r.rfChecker.CheckRedisNumber(rf); err != nil { r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") + r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) return nil } + if err := r.rfChecker.CheckSentinelNumber(rf); err != nil { r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) return nil } @@ -115,6 +120,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } switch nMasters { case 0: + r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE) redisesIP, err := r.rfChecker.GetRedisesIPs(rf) if err != nil { return err @@ -143,6 +149,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e case 1: break default: + r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE) return errors.New("More than one master, fix manually") } @@ -174,11 +181,13 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e port := getRedisPort(rf.Spec.Redis.Port) for _, sip := range sentinels { if err := r.rfChecker.CheckSentinelMonitor(sip, master, port); err != nil { + r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip) r.logger.Debug("Sentinel is not monitoring the correct master") if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { return err } } + } return 
r.checkAndHealSentinels(rf, sentinels) } diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index 396bc72a7..d5b9dc740 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -98,6 +98,7 @@ func (r *RedisFailoverChecker) setSlaveLabelIfNecessary(namespace string, pod co // CheckAllSlavesFromMaster controlls that all slaves have the same master (the real one) func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redisfailoverv1.RedisFailover) error { + r.metricsClient.ResetRedisInstance() rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) if err != nil { return err @@ -129,8 +130,10 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis return err } if slave != "" && slave != master { + r.metricsClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, rp.Status.PodIP) return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave) } + } return nil } diff --git a/service/redis/client.go b/service/redis/client.go index 879e1157d..dc7e8fa67 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -10,6 +10,7 @@ import ( "strings" rediscli "github.com/go-redis/redis/v8" + "github.com/spotahome/redis-operator/log" "github.com/spotahome/redis-operator/metrics" ) @@ -74,6 +75,7 @@ func (c *client) GetNumberSentinelsInMemory(ip string) (int32, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "sentinel").Result() if err != nil { + c.recordRedisError(err, ip) return 0, err } if err2 := isSentinelReady(info); err2 != nil { @@ -101,6 +103,7 @@ func (c *client) GetNumberSentinelSlavesInMemory(ip string) (int32, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "sentinel").Result() if err != nil { + c.recordRedisError(err, ip) return 0, err } if err2 := isSentinelReady(info); err2 != nil { @@ -137,6 
+140,7 @@ func (c *client) ResetSentinel(ip string) error { cmd := rediscli.NewIntCmd(context.TODO(), "SENTINEL", "reset", "*") err := rClient.Process(context.TODO(), cmd) if err != nil { + c.recordRedisError(err, ip) return err } _, err = cmd.Result() @@ -148,6 +152,7 @@ func (c *client) ResetSentinel(ip string) error { // GetSlaveOf returns the master of the given redis, or nil if it's master func (c *client) GetSlaveOf(ip, port, password string) (string, error) { + options := &rediscli.Options{ Addr: net.JoinHostPort(ip, port), Password: password, @@ -157,10 +162,13 @@ func (c *client) GetSlaveOf(ip, port, password string) (string, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "replication").Result() if err != nil { + log.Errorf("error while getting masterIP : %v", err) + c.recordRedisError(err, ip) return "", err } match := redisMasterHostRE.FindStringSubmatch(info) if len(match) == 0 { + log.Errorf("error while getting masterIP : %v", err) c.metricsRecorder.SetRedisInstance(ip, ip, "master") return "", nil } @@ -178,6 +186,7 @@ func (c *client) IsMaster(ip, port, password string) (bool, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "replication").Result() if err != nil { + c.recordRedisError(err, ip) return false, err } return strings.Contains(info, redisRoleMaster), nil @@ -201,10 +210,12 @@ func (c *client) MonitorRedisWithPort(ip, monitor, port, quorum, password string cmd = rediscli.NewBoolCmd(context.TODO(), "SENTINEL", "MONITOR", masterName, monitor, port, quorum) err := rClient.Process(context.TODO(), cmd) if err != nil { + c.recordRedisError(err, ip) return err } _, err = cmd.Result() if err != nil { + c.recordRedisError(err, ip) return err } @@ -212,6 +223,7 @@ func (c *client) MonitorRedisWithPort(ip, monitor, port, quorum, password string cmd = rediscli.NewBoolCmd(context.TODO(), "SENTINEL", "SET", masterName, "auth-pass", password) err := rClient.Process(context.TODO(), cmd) if err != nil { + 
c.recordRedisError(err, ip) return err } _, err = cmd.Result() @@ -231,6 +243,7 @@ func (c *client) MakeMaster(ip string, port string, password string) error { rClient := rediscli.NewClient(options) defer rClient.Close() if res := rClient.SlaveOf(context.TODO(), "NO", "ONE"); res.Err() != nil { + c.recordRedisError(res.Err(), ip) return res.Err() } return nil @@ -249,6 +262,7 @@ func (c *client) MakeSlaveOfWithPort(ip, masterIP, masterPort, password string) rClient := rediscli.NewClient(options) defer rClient.Close() if res := rClient.SlaveOf(context.TODO(), masterIP, masterPort); res.Err() != nil { + c.recordRedisError(res.Err(), ip) return res.Err() } return nil @@ -265,10 +279,12 @@ func (c *client) GetSentinelMonitor(ip string) (string, string, error) { cmd := rediscli.NewSliceCmd(context.TODO(), "SENTINEL", "master", masterName) err := rClient.Process(context.TODO(), cmd) if err != nil { + c.recordRedisError(err, ip) return "", "", err } res, err := cmd.Result() if err != nil { + c.recordRedisError(err, ip) return "", "", err } masterIP := res[3].(string) @@ -320,6 +336,10 @@ func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, func (c *client) applyRedisConfig(parameter string, value string, rClient *rediscli.Client) error { result := rClient.ConfigSet(context.TODO(), parameter, value) + if nil != result.Err() { + c.recordRedisError(result.Err(), strings.Split(rClient.Options().Addr, ":")[0]) + } + return result.Err() } @@ -327,6 +347,7 @@ func (c *client) applySentinelConfig(parameter string, value string, rClient *re cmd := rediscli.NewStatusCmd(context.TODO(), "SENTINEL", "set", masterName, parameter, value) err := rClient.Process(context.TODO(), cmd) if err != nil { + c.recordRedisError(err, strings.Split(rClient.Options().Addr, ":")[0]) return err } return cmd.Err() @@ -350,6 +371,7 @@ func (c *client) SlaveIsReady(ip, port, password string) (bool, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), 
"replication").Result() if err != nil { + c.recordRedisError(err, err.Error()) return false, err } @@ -359,3 +381,17 @@ func (c *client) SlaveIsReady(ip, port, password string) (bool, error) { return ok, nil } + +func (c *client) recordRedisError(err error, ip string) { + if strings.Contains(err.Error(), "NOAUTH") { + c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.NOAUTH, ip) + } else if strings.Contains(err.Error(), "WRONGPASS") { + c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.WRONG_PASSWORD_USED, ip) + } else if strings.Contains(err.Error(), "NOPERM") { + c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.NOPERM, ip) + } else if strings.Contains(err.Error(), "i/o timeout") { + c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.IO_TIMEOUT, ip) + } else if strings.Contains(err.Error(), "connection refused") { + c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.CONNECTION_REFUSED, ip) + } +} From 57d86a4c0d0309cf0f68d7010603b5a16b4a579e Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Tue, 27 Sep 2022 16:09:46 +0530 Subject: [PATCH 05/24] add k8s_operations, redis_operations metrics --- cmd/redisoperator/main.go | 2 +- .../all-redis-operator-resources.yaml | 11 ++- metrics/dummy.go | 4 + metrics/metrics.go | 92 ++++++++++++++++--- operator/redisfailover/checker.go | 5 +- operator/redisfailover/service/check.go | 2 - operator/redisfailover/service/check_test.go | 70 +++++++------- service/k8s/configmap.go | 24 +++-- service/k8s/configmap_test.go | 3 +- service/k8s/deployment.go | 25 +++-- service/k8s/deployment_test.go | 3 +- service/k8s/k8s.go | 21 +++-- service/k8s/pod.go | 27 ++++-- service/k8s/pod_test.go | 3 +- service/k8s/poddisruptionbudget.go | 20 ++-- service/k8s/poddisruptionbudget_test.go | 3 +- 
service/k8s/rbac.go | 31 +++++-- service/k8s/rbac_test.go | 3 +- service/k8s/redisfailover.go | 21 +++-- service/k8s/secret.go | 14 ++- service/k8s/secret_test.go | 3 +- service/k8s/service.go | 28 ++++-- service/k8s/service_test.go | 3 +- service/k8s/statefulset.go | 24 +++-- service/k8s/statefulset_test.go | 3 +- service/k8s/util.go | 17 ++++ service/redis/client.go | 72 +++++++++------ 27 files changed, 372 insertions(+), 162 deletions(-) diff --git a/cmd/redisoperator/main.go b/cmd/redisoperator/main.go index c9b4a535f..c42b73874 100644 --- a/cmd/redisoperator/main.go +++ b/cmd/redisoperator/main.go @@ -82,7 +82,7 @@ func (m *Main) Run() error { } // Create kubernetes service. - k8sservice := k8s.New(k8sClient, customClient, aeClientset, m.logger) + k8sservice := k8s.New(k8sClient, customClient, aeClientset, m.logger, metricsRecorder) // Create the redis clients redisClient := redis.New(metricsRecorder) diff --git a/example/operator/all-redis-operator-resources.yaml b/example/operator/all-redis-operator-resources.yaml index 30342514b..ec4027d16 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -18,8 +18,8 @@ spec: spec: serviceAccountName: redisoperator containers: - - image: quay.io/spotahome/redis-operator:v1.1.0 - imagePullPolicy: IfNotPresent + - image: redis-operator:latest + imagePullPolicy: Never name: app securityContext: readOnlyRootFilesystem: true @@ -96,6 +96,13 @@ rules: - poddisruptionbudgets verbs: - "*" + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - "*" + --- apiVersion: v1 kind: ServiceAccount diff --git a/metrics/dummy.go b/metrics/dummy.go index 431ef7481..a5138daf3 100644 --- a/metrics/dummy.go +++ b/metrics/dummy.go @@ -27,3 +27,7 @@ func (d *dummy) IncrRedisUnhealthyCount(namespace string, resource string, indic } func (d *dummy) IncrSentinelUnhealthyCount(namespace string, resource string, indicator string, instance string) { } +func (d 
dummy) RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) { +} +func (d dummy) RecordRedisOperation(kind /*redis/sentinel? */ string, IP string, operation string, status string, err string) { +} diff --git a/metrics/metrics.go b/metrics/metrics.go index 4514540ff..b8c0bf2e0 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -12,21 +12,47 @@ const ( // variables for setting various indicator labels const ( - NOT_APPLICABLE = "NA" - UNHEALTHY = 1.0 - HEALTHY = 0.0 - REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" - SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" - NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" - SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" - SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" - + SUCCESS = "SUCCESS" + FAIL = "FAIL" + NOT_APPLICABLE = "NA" + UNHEALTHY = 1.0 + HEALTHY = 0.0 + REDIS_REPLICA_MISMATCH = "REDIS_STATEFULSET_REPLICAS_MISMATCH" + SENTINEL_REPLICA_MISMATCH = "SENTINEL_DEPLOYMENT_REPLICAS_MISMATCH" + NUMBER_OF_MASTERS = "MASTER_COUNT_IS_NOT_ONE" + SENTINEL_WRONG_MASTER = "SENTINEL_IS_CONFIGURED_WITH_WRONG_MASTER_IP" + SLAVE_WRONG_MASTER = "SLAVE_IS_CONFIGURED_WITH_WRONG_MASTER_IP" + SENTINEL_NOT_READY = "SENTINEL_NOT_READY" + REGEX_NOT_FOUND = "SENTINEL_REGEX_NOT_FOUND" + MISC = "MISC_ERROR" + SENTINEL_NUMBER_IN_MEMORY_MISMATCH = "SENTINEL_NUMBER_IN_MEMORY_MISMATCH" + REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH = "REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH" // redis connection related errors WRONG_PASSWORD_USED = "WRONG_PASSWORD_USED" NOAUTH = "AUTH_CREDENTIALS_NOT_PROVIDED" NOPERM = "REDIS_USER_DOES_NOT_HAVE_PERMISSIONS" IO_TIMEOUT = "CONNECTION_TIMEDOUT" CONNECTION_REFUSED = "CONNECTION_REFUSED" + + K8S_FORBIDDEN_ERR = "USER_FORBIDDEN_TO_PERFORM_ACTION" + K8S_UNAUTH = "CLIENT_NOT_AUTHORISED" + K8S_MISC = "MISC_ERROR_CHECK_LOGS" + K8S_NOT_FOUND = "RESOURCE_NOT_FOUND" + + KIND_REDIS = "REDIS" + 
KIND_SENTINEL = "SENTINEL" + APPLY_REDIS_CONFIG = "APPLY_REDIS_CONFIG" + APPLY_SENTINEL_CONFIG = "APPLY_SENTINEL_CONFIG" + MONITOR_REDIS_WITH_PORT = "SET_SENTINEL_TO_MONITOR_REDIS_WITH_GIVEN_PORT" + RESET_SENTINEL = "RESET_ALL_SENTINEL_CONFIG" + GET_NUM_SENTINELS_IN_MEM = "GET_NUMBER_OF_SENTINELS_IN_MEMORY" // `info sentinel` command on a sentinel machine > grep sentinel + GET_NUM_REDIS_SLAVES_IN_MEM = "GET_NUMBER_OF_REDIS_SLAVES_IN_MEMORY" // `info sentinel` command on a sentinel machine > grep slaves + GET_SLAVE_OF = "GET_MASTER_OF_GIVEN_SLAVE_INSTANCE" + IS_MASTER = "CHECK_IF_INSTANCE_IS_MASTER" + MAKE_MASTER = "MAKE_INSTANCE_AS_MASTER" + MAKE_SLAVE_OF = "MAKE_SLAVE_OF_GIVEN_MASTER_INSTANCE" + GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE" + SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" ) // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics. @@ -49,6 +75,10 @@ type Recorder interface { IncrRedisUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) + + RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) + + RecordRedisOperation(kind /*redis/sentinel? */ string, IP string, operation string, status string, err string) } // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. @@ -57,9 +87,11 @@ type recorder struct { clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. 
- redisInstance *prometheus.GaugeVec // Indicates known redis instances, with IPs and master/slave status + redisInstance *prometheus.GaugeVec // indicates known redis instances, with IPs and master/slave status redisUnhealthy *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) sentinelUnhealthy *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) + k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s + redisOperations *prometheus.CounterVec // number of operations performed on redis/sentinel instances koopercontroller.MetricsRecorder } @@ -78,13 +110,18 @@ type redisInstanceRecorder struct { koopercontroller.MetricsRecorder } -type redisHealthRecorder struct { - redisUnhealthy *prometheus.CounterVec +type k8sServiceOperationsRecorder struct { + operations *prometheus.CounterVec koopercontroller.MetricsRecorder } -type sentinelHealthRecorder struct { - sentinelUnhealthy *prometheus.CounterVec +type k8sServiceerrorRecorder struct { + errors *prometheus.CounterVec + koopercontroller.MetricsRecorder +} + +type redisHealthRecorder struct { + redisUnhealthy *prometheus.CounterVec koopercontroller.MetricsRecorder } @@ -133,6 +170,21 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { Help: "indicates any error encountered in managed sentinel instance(s)", }, []string{"namespace", "resource", "indicator", "instance"}) + redisOperations := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "redis_operations", + Help: "number of operations performed on redis", + }, []string{"kind" /* redis/sentinel? 
*/, "IP", "operation", "status", "err"}) + + k8sServiceOperations := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: promControllerSubsystem, + Name: "k8s_operations", + Help: "number of operations performed on k8s", + }, []string{"namespace", "kind", "object", "operation", "status", "err"}) // Create the instance. r := recorder{ clusterOK: clusterOK, @@ -141,6 +193,8 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { redisInstance: redisInstance, redisUnhealthy: redisUnhealthy, sentinelUnhealthy: sentinelUnhealthy, + k8sServiceOperations: k8sServiceOperations, + redisOperations: redisOperations, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -154,6 +208,8 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { r.redisInstance, r.redisUnhealthy, r.sentinelUnhealthy, + r.k8sServiceOperations, + r.redisOperations, ) return r @@ -197,3 +253,11 @@ func (r recorder) IncrRedisUnhealthyCount(namespace string, resource string, ind func (r recorder) IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string) { r.sentinelUnhealthy.WithLabelValues(namespace, resource, indicator, instance).Add(1) } + +func (r recorder) RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) { + r.k8sServiceOperations.WithLabelValues(namespace, kind, object, operation, status, err).Add(1) +} + +func (r recorder) RecordRedisOperation(kind /*redis/sentinel? 
*/ string, IP string, operation string, status string, err string) { + r.redisOperations.WithLabelValues(kind, IP, operation, status, err).Add(1) +} diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index d6c4f0148..ee0c86083 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -158,6 +158,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return err } if err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf); err2 != nil { + r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE) r.logger.Debug("Not all slaves have the same master") if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil { return err3 @@ -181,7 +182,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e port := getRedisPort(rf.Spec.Redis.Port) for _, sip := range sentinels { if err := r.rfChecker.CheckSentinelMonitor(sip, master, port); err != nil { - r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip) + r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip) r.logger.Debug("Sentinel is not monitoring the correct master") if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { return err @@ -251,6 +252,7 @@ func (r *RedisFailoverHandler) applyRedisCustomConfig(rf *redisfailoverv1.RedisF func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover, sentinels []string) error { for _, sip := range sentinels { if err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf); err != nil { + r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, metrics.NOT_APPLICABLE) r.logger.Debug("Sentinel has more sentinel in memory than spected") if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err @@ -261,6 +263,7 @@ func (r 
*RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa if err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf); err != nil { + r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, metrics.NOT_APPLICABLE) r.logger.Debug("Sentinel has more slaves in memory than spected") if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } } diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index d5b9dc740..b1add5f9d 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -98,7 +98,6 @@ func (r *RedisFailoverChecker) setSlaveLabelIfNecessary(namespace string, pod co // CheckAllSlavesFromMaster controlls that all slaves have the same master (the real one) func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redisfailoverv1.RedisFailover) error { - r.metricsClient.ResetRedisInstance() rps, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf)) if err != nil { return err @@ -130,7 +129,6 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis return err } if slave != "" && slave != master { - r.metricsClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, rp.Status.PodIP) return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave) } diff --git a/operator/redisfailover/service/check_test.go b/operator/redisfailover/service/check_test.go index bb9595eb8..79ceb84a7 100644 --- a/operator/redisfailover/service/check_test.go +++ b/operator/redisfailover/service/check_test.go @@ -46,7 +46,7 @@ func TestCheckRedisNumberError(t *testing.T) { ms.On("GetStatefulSet", namespace, rfservice.GetRedisName(rf)).Once().Return(nil, errors.New("")) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := 
rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckRedisNumber(rf) assert.Error(err) @@ -88,7 +88,7 @@ func TestCheckRedisNumberTrue(t *testing.T) { ms.On("GetStatefulSet", namespace, rfservice.GetRedisName(rf)).Once().Return(ss, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckRedisNumber(rf) assert.NoError(err) @@ -103,7 +103,7 @@ func TestCheckSentinelNumberError(t *testing.T) { ms.On("GetDeployment", namespace, rfservice.GetSentinelName(rf)).Once().Return(nil, errors.New("")) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumber(rf) assert.Error(err) @@ -124,7 +124,7 @@ func TestCheckSentinelNumberFalse(t *testing.T) { ms.On("GetDeployment", namespace, rfservice.GetSentinelName(rf)).Once().Return(ss, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumber(rf) assert.Error(err) @@ -145,7 +145,7 @@ func TestCheckSentinelNumberTrue(t *testing.T) { ms.On("GetDeployment", namespace, rfservice.GetSentinelName(rf)).Once().Return(ss, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumber(rf) assert.NoError(err) @@ -161,7 +161,7 @@ func TestCheckAllSlavesFromMasterGetStatefulSetError(t *testing.T) { ms.On("UpdatePodLabels", namespace, mock.AnythingOfType("string"), mock.Anything).Once().Return(nil) mr := &mRedisService.Client{} - checker := 
rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckAllSlavesFromMaster("", rf) assert.Error(err) @@ -189,7 +189,7 @@ func TestCheckAllSlavesFromMasterGetSlaveOfError(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSlaveOf", "", "0", "").Once().Return("", errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckAllSlavesFromMaster("", rf) assert.Error(err) @@ -217,7 +217,7 @@ func TestCheckAllSlavesFromMasterDifferentMaster(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSlaveOf", "0.0.0.0", "0", "").Once().Return("1.1.1.1", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckAllSlavesFromMaster("0.0.0.0", rf) assert.Error(err) @@ -245,7 +245,7 @@ func TestCheckAllSlavesFromMaster(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSlaveOf", "0.0.0.0", "0", "").Once().Return("1.1.1.1", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckAllSlavesFromMaster("1.1.1.1", rf) assert.NoError(err) @@ -260,7 +260,7 @@ func TestCheckSentinelNumberInMemoryGetDeploymentPodsError(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetNumberSentinelsInMemory", "1.1.1.1").Once().Return(int32(0), errors.New("expected error")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumberInMemory("1.1.1.1", rf) assert.Error(err) @@ -275,7 +275,7 @@ func 
TestCheckSentinelNumberInMemoryGetNumberSentinelInMemoryError(t *testing.T) mr := &mRedisService.Client{} mr.On("GetNumberSentinelsInMemory", "1.1.1.1").Once().Return(int32(0), errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumberInMemory("1.1.1.1", rf) assert.Error(err) @@ -290,7 +290,7 @@ func TestCheckSentinelNumberInMemoryNumberMismatch(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetNumberSentinelsInMemory", "1.1.1.1").Once().Return(int32(4), nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumberInMemory("1.1.1.1", rf) assert.Error(err) @@ -305,7 +305,7 @@ func TestCheckSentinelNumberInMemory(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetNumberSentinelsInMemory", "1.1.1.1").Once().Return(int32(3), nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelNumberInMemory("1.1.1.1", rf) assert.NoError(err) @@ -320,7 +320,7 @@ func TestCheckSentinelSlavesNumberInMemoryGetNumberSentinelSlavesInMemoryError(t mr := &mRedisService.Client{} mr.On("GetNumberSentinelSlavesInMemory", "1.1.1.1").Once().Return(int32(0), errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelSlavesNumberInMemory("1.1.1.1", rf) assert.Error(err) @@ -335,7 +335,7 @@ func TestCheckSentinelSlavesNumberInMemoryReplicasMismatch(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetNumberSentinelSlavesInMemory", "1.1.1.1").Once().Return(int32(3), nil) - checker := 
rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelSlavesNumberInMemory("1.1.1.1", rf) assert.Error(err) @@ -351,7 +351,7 @@ func TestCheckSentinelSlavesNumberInMemory(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetNumberSentinelSlavesInMemory", "1.1.1.1").Once().Return(int32(4), nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelSlavesNumberInMemory("1.1.1.1", rf) assert.NoError(err) @@ -364,7 +364,7 @@ func TestCheckSentinelMonitorGetSentinelMonitorError(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("", "", errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "1.1.1.1") assert.Error(err) @@ -377,7 +377,7 @@ func TestCheckSentinelMonitorMismatch(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("2.2.2.2", "6379", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "1.1.1.1") assert.Error(err) @@ -390,7 +390,7 @@ func TestCheckSentinelMonitor(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("1.1.1.1", "6379", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "1.1.1.1") assert.NoError(err) @@ -403,7 +403,7 @@ func TestCheckSentinelMonitorWithPort(t 
*testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("1.1.1.1", "6379", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "1.1.1.1", "6379") assert.NoError(err) @@ -416,7 +416,7 @@ func TestCheckSentinelMonitorWithPortMismatch(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("1.1.1.1", "6379", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "0.0.0.0", "6379") assert.Error(err) @@ -429,7 +429,7 @@ func TestCheckSentinelMonitorWithPortIPMismatch(t *testing.T) { mr := &mRedisService.Client{} mr.On("GetSentinelMonitor", "0.0.0.0").Once().Return("1.1.1.1", "6379", nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) err := checker.CheckSentinelMonitor("0.0.0.0", "1.1.1.1", "6380") assert.Error(err) @@ -444,7 +444,7 @@ func TestGetMasterIPGetStatefulSetPodsError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(nil, errors.New("")) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) _, err := checker.GetMasterIP(rf) assert.Error(err) @@ -471,7 +471,7 @@ func TestGetMasterIPIsMasterError(t *testing.T) { mr := &mRedisService.Client{} mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(false, errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, 
metrics.Dummy) _, err := checker.GetMasterIP(rf) assert.Error(err) @@ -505,7 +505,7 @@ func TestGetMasterIPMultipleMastersError(t *testing.T) { mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(true, nil) mr.On("IsMaster", "1.1.1.1", "0", "").Once().Return(true, nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) _, err := checker.GetMasterIP(rf) assert.Error(err) @@ -539,7 +539,7 @@ func TestGetMasterIP(t *testing.T) { mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(true, nil) mr.On("IsMaster", "1.1.1.1", "0", "").Once().Return(false, nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) master, err := checker.GetMasterIP(rf) assert.NoError(err) @@ -555,7 +555,7 @@ func TestGetNumberMastersGetStatefulSetPodsError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(nil, errors.New("")) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) _, err := checker.GetNumberMasters(rf) assert.Error(err) @@ -582,7 +582,7 @@ func TestGetNumberMastersIsMasterError(t *testing.T) { mr := &mRedisService.Client{} mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(true, errors.New("")) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) _, err := checker.GetNumberMasters(rf) assert.NoError(err) @@ -616,7 +616,7 @@ func TestGetNumberMasters(t *testing.T) { mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(true, nil) mr.On("IsMaster", "1.1.1.1", "0", "").Once().Return(false, nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + 
checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) masterNumber, err := checker.GetNumberMasters(rf) assert.NoError(err) @@ -651,7 +651,7 @@ func TestGetNumberMastersTwo(t *testing.T) { mr.On("IsMaster", "0.0.0.0", "0", "").Once().Return(true, nil) mr.On("IsMaster", "1.1.1.1", "0", "").Once().Return(true, nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) masterNumber, err := checker.GetNumberMasters(rf) assert.NoError(err) @@ -667,7 +667,7 @@ func TestGetMinimumRedisPodTimeGetStatefulSetPodsError(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(nil, errors.New("")) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) _, err := checker.GetMinimumRedisPodTime(rf) assert.Error(err) @@ -705,7 +705,7 @@ func TestGetMinimumRedisPodTime(t *testing.T) { ms.On("GetStatefulSetPods", namespace, rfservice.GetRedisName(rf)).Once().Return(pods, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) minTime, err := checker.GetMinimumRedisPodTime(rf) assert.NoError(err) @@ -756,7 +756,7 @@ func TestGetRedisPodsNames(t *testing.T) { mr.On("IsMaster", "0.0.0.0", "0", "").Twice().Return(false, nil) mr.On("IsMaster", "1.1.1.1", "0", "").Once().Return(true, nil) - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) master, err := checker.GetRedisesMasterPod(rf) assert.NoError(err) @@ -807,7 +807,7 @@ func TestGetStatefulSetUpdateRevision(t *testing.T) { ms.On("GetStatefulSet", namespace, 
rfservice.GetRedisName(rf)).Once().Return(test.ss, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) version, err := checker.GetStatefulSetUpdateRevision(rf) if test.expectedError == nil { @@ -856,7 +856,7 @@ func TestGetRedisRevisionHash(t *testing.T) { ms.On("GetPod", namespace, "namepod").Once().Return(test.pod, nil) mr := &mRedisService.Client{} - checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}) + checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy) hash, err := checker.GetRedisRevisionHash("namepod", rf) if test.expectedError == nil { diff --git a/service/k8s/configmap.go b/service/k8s/configmap.go index 9077687c9..3b9d77a8f 100644 --- a/service/k8s/configmap.go +++ b/service/k8s/configmap.go @@ -9,6 +9,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // ConfigMap the ServiceAccount service that knows how to interact with k8s to manage them @@ -23,21 +24,24 @@ type ConfigMap interface { // ConfigMapService is the configMap service implementation using API calls to kubernetes. type ConfigMapService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewConfigMapService returns a new ConfigMap KubeService. 
-func NewConfigMapService(kubeClient kubernetes.Interface, logger log.Logger) *ConfigMapService { +func NewConfigMapService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *ConfigMapService { logger = logger.With("service", "k8s.configMap") return &ConfigMapService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (p *ConfigMapService) GetConfigMap(namespace string, name string) (*corev1.ConfigMap, error) { configMap, err := p.kubeClient.CoreV1().ConfigMaps(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "ConfigMap", name, "GET", err, p.metricsRecorder) if err != nil { return nil, err } @@ -46,6 +50,7 @@ func (p *ConfigMapService) GetConfigMap(namespace string, name string) (*corev1. func (p *ConfigMapService) CreateConfigMap(namespace string, configMap *corev1.ConfigMap) error { _, err := p.kubeClient.CoreV1().ConfigMaps(namespace).Create(context.TODO(), configMap, metav1.CreateOptions{}) + recordMetrics(namespace, "ConfigMap", configMap.GetName(), "CREATE", err, p.metricsRecorder) if err != nil { return err } @@ -54,6 +59,7 @@ func (p *ConfigMapService) CreateConfigMap(namespace string, configMap *corev1.C } func (p *ConfigMapService) UpdateConfigMap(namespace string, configMap *corev1.ConfigMap) error { _, err := p.kubeClient.CoreV1().ConfigMaps(namespace).Update(context.TODO(), configMap, metav1.UpdateOptions{}) + recordMetrics(namespace, "ConfigMap", configMap.GetName(), "UPDATE", err, p.metricsRecorder) if err != nil { return err } @@ -79,9 +85,13 @@ func (p *ConfigMapService) CreateOrUpdateConfigMap(namespace string, configMap * } func (p *ConfigMapService) DeleteConfigMap(namespace string, name string) error { - return p.kubeClient.CoreV1().ConfigMaps(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + err := p.kubeClient.CoreV1().ConfigMaps(namespace).Delete(context.TODO(), name, 
metav1.DeleteOptions{}) + recordMetrics(namespace, "ConfigMap", name, "DELETE", err, p.metricsRecorder) + return err } func (p *ConfigMapService) ListConfigMaps(namespace string) (*corev1.ConfigMapList, error) { - return p.kubeClient.CoreV1().ConfigMaps(namespace).List(context.TODO(), metav1.ListOptions{}) + objects, err := p.kubeClient.CoreV1().ConfigMaps(namespace).List(context.TODO(), metav1.ListOptions{}) + recordMetrics(namespace, "ConfigMap", metrics.NOT_APPLICABLE, "LIST", err, p.metricsRecorder) + return objects, err } diff --git a/service/k8s/configmap_test.go b/service/k8s/configmap_test.go index 1a8b8be78..da0701cf5 100644 --- a/service/k8s/configmap_test.go +++ b/service/k8s/configmap_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -103,7 +104,7 @@ func TestConfigMapServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewConfigMapService(mcli, log.Dummy) + service := k8s.NewConfigMapService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdateConfigMap(testns, test.configMap) if test.expErr { diff --git a/service/k8s/deployment.go b/service/k8s/deployment.go index c215ce82a..ec945014b 100644 --- a/service/k8s/deployment.go +++ b/service/k8s/deployment.go @@ -12,6 +12,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // Deployment the Deployment service that knows how to interact with k8s to manage them @@ -27,22 +28,25 @@ type Deployment interface { // DeploymentService is the service account service implementation using API calls to kubernetes. 
type DeploymentService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewDeploymentService returns a new Deployment KubeService. -func NewDeploymentService(kubeClient kubernetes.Interface, logger log.Logger) *DeploymentService { +func NewDeploymentService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *DeploymentService { logger = logger.With("service", "k8s.deployment") return &DeploymentService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } // GetDeployment will retrieve the requested deployment based on namespace and name func (d *DeploymentService) GetDeployment(namespace, name string) (*appsv1.Deployment, error) { deployment, err := d.kubeClient.AppsV1().Deployments(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Deployment", name, "GET", err, d.metricsRecorder) if err != nil { return nil, err } @@ -52,6 +56,7 @@ func (d *DeploymentService) GetDeployment(namespace, name string) (*appsv1.Deplo // GetDeploymentPods will retrieve the pods managed by a given deployment func (d *DeploymentService) GetDeploymentPods(namespace, name string) (*corev1.PodList, error) { deployment, err := d.kubeClient.AppsV1().Deployments(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Deployment", name, "GET", err, d.metricsRecorder) if err != nil { return nil, err } @@ -66,6 +71,7 @@ func (d *DeploymentService) GetDeploymentPods(namespace, name string) (*corev1.P // CreateDeployment will create the given deployment func (d *DeploymentService) CreateDeployment(namespace string, deployment *appsv1.Deployment) error { _, err := d.kubeClient.AppsV1().Deployments(namespace).Create(context.TODO(), deployment, metav1.CreateOptions{}) + recordMetrics(namespace, "Deployment", 
deployment.GetName(), "CREATE", err, d.metricsRecorder) if err != nil { return err } @@ -76,6 +82,7 @@ func (d *DeploymentService) CreateDeployment(namespace string, deployment *appsv // UpdateDeployment will update the given deployment func (d *DeploymentService) UpdateDeployment(namespace string, deployment *appsv1.Deployment) error { _, err := d.kubeClient.AppsV1().Deployments(namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{}) + recordMetrics(namespace, "Deployment", deployment.GetName(), "UPDATE", err, d.metricsRecorder) if err != nil { return err } @@ -105,10 +112,14 @@ func (d *DeploymentService) CreateOrUpdateDeployment(namespace string, deploymen // DeleteDeployment will delete the given deployment func (d *DeploymentService) DeleteDeployment(namespace, name string) error { propagation := metav1.DeletePropagationForeground - return d.kubeClient.AppsV1().Deployments(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + err := d.kubeClient.AppsV1().Deployments(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + recordMetrics(namespace, "Deployment", name, "DELETE", err, d.metricsRecorder) + return err } // ListDeployments will give all the deployments on a given namespace func (d *DeploymentService) ListDeployments(namespace string) (*appsv1.DeploymentList, error) { - return d.kubeClient.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{}) + deployments, err := d.kubeClient.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{}) + recordMetrics(namespace, "Deployment", metrics.NOT_APPLICABLE, "LIST", err, d.metricsRecorder) + return deployments, err } diff --git a/service/k8s/deployment_test.go b/service/k8s/deployment_test.go index 8dc227515..14be68360 100644 --- a/service/k8s/deployment_test.go +++ b/service/k8s/deployment_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" 
"github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -103,7 +104,7 @@ func TestDeploymentServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewDeploymentService(mcli, log.Dummy) + service := k8s.NewDeploymentService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdateDeployment(testns, test.deployment) if test.expErr { diff --git a/service/k8s/k8s.go b/service/k8s/k8s.go index f3fa9555f..b6e68ae44 100644 --- a/service/k8s/k8s.go +++ b/service/k8s/k8s.go @@ -6,6 +6,7 @@ import ( redisfailoverclientset "github.com/spotahome/redis-operator/client/k8s/clientset/versioned" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // Service is the K8s service entrypoint. @@ -34,16 +35,16 @@ type services struct { } // New returns a new Kubernetes service. -func New(kubecli kubernetes.Interface, crdcli redisfailoverclientset.Interface, apiextcli apiextensionscli.Interface, logger log.Logger) Services { +func New(kubecli kubernetes.Interface, crdcli redisfailoverclientset.Interface, apiextcli apiextensionscli.Interface, logger log.Logger, metricsRecorder metrics.Recorder) Services { return &services{ - ConfigMap: NewConfigMapService(kubecli, logger), - Secret: NewSecretService(kubecli, logger), - Pod: NewPodService(kubecli, logger), - PodDisruptionBudget: NewPodDisruptionBudgetService(kubecli, logger), - RedisFailover: NewRedisFailoverService(crdcli, logger), - Service: NewServiceService(kubecli, logger), - RBAC: NewRBACService(kubecli, logger), - Deployment: NewDeploymentService(kubecli, logger), - StatefulSet: NewStatefulSetService(kubecli, logger), + ConfigMap: NewConfigMapService(kubecli, logger, metricsRecorder), + Secret: NewSecretService(kubecli, logger, metricsRecorder), + Pod: NewPodService(kubecli, logger, metricsRecorder), + PodDisruptionBudget: 
NewPodDisruptionBudgetService(kubecli, logger, metricsRecorder), + RedisFailover: NewRedisFailoverService(crdcli, logger, metricsRecorder), + Service: NewServiceService(kubecli, logger, metricsRecorder), + RBAC: NewRBACService(kubecli, logger, metricsRecorder), + Deployment: NewDeploymentService(kubecli, logger, metricsRecorder), + StatefulSet: NewStatefulSetService(kubecli, logger, metricsRecorder), } } diff --git a/service/k8s/pod.go b/service/k8s/pod.go index 6125557ba..5a6f9c8e2 100644 --- a/service/k8s/pod.go +++ b/service/k8s/pod.go @@ -12,6 +12,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // Pod the ServiceAccount service that knows how to interact with k8s to manage them @@ -27,21 +28,24 @@ type Pod interface { // PodService is the pod service implementation using API calls to kubernetes. type PodService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewPodService returns a new Pod KubeService. 
-func NewPodService(kubeClient kubernetes.Interface, logger log.Logger) *PodService { +func NewPodService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *PodService { logger = logger.With("service", "k8s.pod") return &PodService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (p *PodService) GetPod(namespace string, name string) (*corev1.Pod, error) { pod, err := p.kubeClient.CoreV1().Pods(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Pod", name, "GET", err, p.metricsRecorder) if err != nil { return nil, err } @@ -50,6 +54,7 @@ func (p *PodService) GetPod(namespace string, name string) (*corev1.Pod, error) func (p *PodService) CreatePod(namespace string, pod *corev1.Pod) error { _, err := p.kubeClient.CoreV1().Pods(namespace).Create(context.TODO(), pod, metav1.CreateOptions{}) + recordMetrics(namespace, "Pod", pod.GetName(), "CREATE", err, p.metricsRecorder) if err != nil { return err } @@ -58,6 +63,7 @@ func (p *PodService) CreatePod(namespace string, pod *corev1.Pod) error { } func (p *PodService) UpdatePod(namespace string, pod *corev1.Pod) error { _, err := p.kubeClient.CoreV1().Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) + recordMetrics(namespace, "Pod", pod.GetName(), "UPDATE", err, p.metricsRecorder) if err != nil { return err } @@ -83,14 +89,18 @@ func (p *PodService) CreateOrUpdatePod(namespace string, pod *corev1.Pod) error } func (p *PodService) DeletePod(namespace string, name string) error { - return p.kubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + err := p.kubeClient.CoreV1().Pods(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + recordMetrics(namespace, "Pod", name, "DELETE", err, p.metricsRecorder) + return err } func (p *PodService) ListPods(namespace string) (*corev1.PodList, error) { - return 
p.kubeClient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{}) + pods, err := p.kubeClient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{}) + recordMetrics(namespace, "Pod", metrics.NOT_APPLICABLE, "LIST", err, p.metricsRecorder) + return pods, err } -//PatchStringValue specifies a patch operation for a string. +// PatchStringValue specifies a patch operation for a string. type PatchStringValue struct { Op string `json:"op"` Path string `json:"path"` @@ -112,6 +122,7 @@ func (p *PodService) UpdatePodLabels(namespace, podName string, labels map[strin payloadBytes, _ := json.Marshal(payloads) _, err := p.kubeClient.CoreV1().Pods(namespace).Patch(context.TODO(), podName, types.JSONPatchType, payloadBytes, metav1.PatchOptions{}) + recordMetrics(namespace, "Pod", podName, "PATCH", err, p.metricsRecorder) if err != nil { p.logger.Errorf("Update pod labels failed, namespace: %s, pod name: %s, error: %v", namespace, podName, err) } diff --git a/service/k8s/pod_test.go b/service/k8s/pod_test.go index e57937d94..eaf4e7feb 100644 --- a/service/k8s/pod_test.go +++ b/service/k8s/pod_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -103,7 +104,7 @@ func TestPodServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewPodService(mcli, log.Dummy) + service := k8s.NewPodService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdatePod(testns, test.pod) if test.expErr { diff --git a/service/k8s/poddisruptionbudget.go b/service/k8s/poddisruptionbudget.go index 32a1d2170..df24685df 100644 --- a/service/k8s/poddisruptionbudget.go +++ b/service/k8s/poddisruptionbudget.go @@ -9,6 +9,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // 
PodDisruptionBudget the ServiceAccount service that knows how to interact with k8s to manage them @@ -22,21 +23,24 @@ type PodDisruptionBudget interface { // PodDisruptionBudgetService is the podDisruptionBudget service implementation using API calls to kubernetes. type PodDisruptionBudgetService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewPodDisruptionBudgetService returns a new PodDisruptionBudget KubeService. -func NewPodDisruptionBudgetService(kubeClient kubernetes.Interface, logger log.Logger) *PodDisruptionBudgetService { +func NewPodDisruptionBudgetService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *PodDisruptionBudgetService { logger = logger.With("service", "k8s.podDisruptionBudget") return &PodDisruptionBudgetService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (p *PodDisruptionBudgetService) GetPodDisruptionBudget(namespace string, name string) (*policyv1.PodDisruptionBudget, error) { podDisruptionBudget, err := p.kubeClient.PolicyV1().PodDisruptionBudgets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "PodDisruptionBudget", name, "GET", err, p.metricsRecorder) if err != nil { return nil, err } @@ -45,6 +49,7 @@ func (p *PodDisruptionBudgetService) GetPodDisruptionBudget(namespace string, na func (p *PodDisruptionBudgetService) CreatePodDisruptionBudget(namespace string, podDisruptionBudget *policyv1.PodDisruptionBudget) error { _, err := p.kubeClient.PolicyV1().PodDisruptionBudgets(namespace).Create(context.TODO(), podDisruptionBudget, metav1.CreateOptions{}) + recordMetrics(namespace, "PodDisruptionBudget", podDisruptionBudget.GetName(), "CREATE", err, p.metricsRecorder) if err != nil { return err } @@ -54,6 +59,7 @@ func (p *PodDisruptionBudgetService) 
CreatePodDisruptionBudget(namespace string, func (p *PodDisruptionBudgetService) UpdatePodDisruptionBudget(namespace string, podDisruptionBudget *policyv1.PodDisruptionBudget) error { _, err := p.kubeClient.PolicyV1().PodDisruptionBudgets(namespace).Update(context.TODO(), podDisruptionBudget, metav1.UpdateOptions{}) + recordMetrics(namespace, "PodDisruptionBudget", podDisruptionBudget.GetName(), "UPDATE", err, p.metricsRecorder) if err != nil { return err } @@ -80,5 +86,7 @@ func (p *PodDisruptionBudgetService) CreateOrUpdatePodDisruptionBudget(namespace } func (p *PodDisruptionBudgetService) DeletePodDisruptionBudget(namespace string, name string) error { - return p.kubeClient.PolicyV1().PodDisruptionBudgets(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + err := p.kubeClient.PolicyV1().PodDisruptionBudgets(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + recordMetrics(namespace, "PodDisruptionBudget", name, "DELETE", err, p.metricsRecorder) + return err } diff --git a/service/k8s/poddisruptionbudget_test.go b/service/k8s/poddisruptionbudget_test.go index c9bf97471..c2427859d 100644 --- a/service/k8s/poddisruptionbudget_test.go +++ b/service/k8s/poddisruptionbudget_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -101,7 +102,7 @@ func TestPodDisruptionBudgetServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewPodDisruptionBudgetService(mcli, log.Dummy) + service := k8s.NewPodDisruptionBudgetService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdatePodDisruptionBudget(testns, test.podDisruptionBudget) if test.expErr { diff --git a/service/k8s/rbac.go b/service/k8s/rbac.go index 24e298a22..300f84a4e 100644 --- a/service/k8s/rbac.go +++ b/service/k8s/rbac.go @@ -9,6 +9,7 @@ import ( 
"k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // RBAC is the service that knows how to interact with k8s to manage RBAC related resources. @@ -26,33 +27,42 @@ type RBAC interface { // NamespaceService is the Namespace service implementation using API calls to kubernetes. type RBACService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewRBACService returns a new RBAC KubeService. -func NewRBACService(kubeClient kubernetes.Interface, logger log.Logger) *RBACService { +func NewRBACService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *RBACService { logger = logger.With("service", "k8s.rbac") return &RBACService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (r *RBACService) GetClusterRole(name string) (*rbacv1.ClusterRole, error) { - return r.kubeClient.RbacV1().ClusterRoles().Get(context.TODO(), name, metav1.GetOptions{}) + clusterRole, err := r.kubeClient.RbacV1().ClusterRoles().Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(metrics.NOT_APPLICABLE, "ClusterRole", name, "GET", err, r.metricsRecorder) + return clusterRole, err } func (r *RBACService) GetRole(namespace, name string) (*rbacv1.Role, error) { - return r.kubeClient.RbacV1().Roles(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + role, err := r.kubeClient.RbacV1().Roles(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Role", name, "GET", err, r.metricsRecorder) + return role, err } func (r *RBACService) GetRoleBinding(namespace, name string) (*rbacv1.RoleBinding, error) { - return r.kubeClient.RbacV1().RoleBindings(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + rolbinding, err := 
r.kubeClient.RbacV1().RoleBindings(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "RoleBinding", name, "GET", err, r.metricsRecorder) + return rolbinding, err } func (r *RBACService) DeleteRole(namespace, name string) error { err := r.kubeClient.RbacV1().Roles(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + recordMetrics(namespace, "Role", name, "DELETE", err, r.metricsRecorder) if err != nil { return err } @@ -62,6 +72,7 @@ func (r *RBACService) DeleteRole(namespace, name string) error { func (r *RBACService) CreateRole(namespace string, role *rbacv1.Role) error { _, err := r.kubeClient.RbacV1().Roles(namespace).Create(context.TODO(), role, metav1.CreateOptions{}) + recordMetrics(namespace, "Role", role.GetName(), "CREATE", err, r.metricsRecorder) if err != nil { return err } @@ -71,6 +82,7 @@ func (r *RBACService) CreateRole(namespace string, role *rbacv1.Role) error { func (s *RBACService) UpdateRole(namespace string, role *rbacv1.Role) error { _, err := s.kubeClient.RbacV1().Roles(namespace).Update(context.TODO(), role, metav1.UpdateOptions{}) + recordMetrics(namespace, "Role", role.GetName(), "UPDATE", err, s.metricsRecorder) if err != nil { return err } @@ -98,6 +110,7 @@ func (r *RBACService) CreateOrUpdateRole(namespace string, role *rbacv1.Role) er func (r *RBACService) DeleteRoleBinding(namespace, name string) error { err := r.kubeClient.RbacV1().RoleBindings(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) + recordMetrics(namespace, "RoleBinding", name, "DELETE", err, r.metricsRecorder) if err != nil { return err } @@ -107,6 +120,7 @@ func (r *RBACService) DeleteRoleBinding(namespace, name string) error { func (r *RBACService) CreateRoleBinding(namespace string, binding *rbacv1.RoleBinding) error { _, err := r.kubeClient.RbacV1().RoleBindings(namespace).Create(context.TODO(), binding, metav1.CreateOptions{}) + recordMetrics(namespace, "RoleBinding", binding.GetName(), "CREATE", err, 
r.metricsRecorder) if err != nil { return err } @@ -116,6 +130,7 @@ func (r *RBACService) CreateRoleBinding(namespace string, binding *rbacv1.RoleBi func (r *RBACService) UpdateRoleBinding(namespace string, binding *rbacv1.RoleBinding) error { _, err := r.kubeClient.RbacV1().RoleBindings(namespace).Update(context.TODO(), binding, metav1.UpdateOptions{}) + recordMetrics(namespace, "RoleBinding", binding.GetName(), "UPDATE", err, r.metricsRecorder) if err != nil { return err } diff --git a/service/k8s/rbac_test.go b/service/k8s/rbac_test.go index 56b07e50f..d4f942adb 100644 --- a/service/k8s/rbac_test.go +++ b/service/k8s/rbac_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -130,7 +131,7 @@ func TestRBACServiceGetCreateOrUpdateRoleBinding(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewRBACService(mcli, log.Dummy) + service := k8s.NewRBACService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdateRoleBinding(testns, test.rb) if test.expErr { diff --git a/service/k8s/redisfailover.go b/service/k8s/redisfailover.go index f2bf539ad..9878390f6 100644 --- a/service/k8s/redisfailover.go +++ b/service/k8s/redisfailover.go @@ -9,6 +9,7 @@ import ( redisfailoverv1 "github.com/spotahome/redis-operator/api/redisfailover/v1" redisfailoverclientset "github.com/spotahome/redis-operator/client/k8s/clientset/versioned" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // RedisFailover the RF service that knows how to interact with k8s to get them @@ -21,25 +22,31 @@ type RedisFailover interface { // RedisFailoverService is the RedisFailover service implementation using API calls to kubernetes. 
type RedisFailoverService struct { - k8sCli redisfailoverclientset.Interface - logger log.Logger + k8sCli redisfailoverclientset.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewRedisFailoverService returns a new Workspace KubeService. -func NewRedisFailoverService(k8scli redisfailoverclientset.Interface, logger log.Logger) *RedisFailoverService { +func NewRedisFailoverService(k8scli redisfailoverclientset.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *RedisFailoverService { logger = logger.With("service", "k8s.redisfailover") return &RedisFailoverService{ - k8sCli: k8scli, - logger: logger, + k8sCli: k8scli, + logger: logger, + metricsRecorder: metricsRecorder, } } // ListRedisFailovers satisfies redisfailover.Service interface. func (r *RedisFailoverService) ListRedisFailovers(ctx context.Context, namespace string, opts metav1.ListOptions) (*redisfailoverv1.RedisFailoverList, error) { - return r.k8sCli.DatabasesV1().RedisFailovers(namespace).List(ctx, opts) + redisFailoverList, err := r.k8sCli.DatabasesV1().RedisFailovers(namespace).List(ctx, opts) + recordMetrics(namespace, "RedisFailover", metrics.NOT_APPLICABLE, "LIST", err, r.metricsRecorder) + return redisFailoverList, err } // WatchRedisFailovers satisfies redisfailover.Service interface. 
func (r *RedisFailoverService) WatchRedisFailovers(ctx context.Context, namespace string, opts metav1.ListOptions) (watch.Interface, error) { - return r.k8sCli.DatabasesV1().RedisFailovers(namespace).Watch(ctx, opts) + watcher, err := r.k8sCli.DatabasesV1().RedisFailovers(namespace).Watch(ctx, opts) + recordMetrics(namespace, "RedisFailover", metrics.NOT_APPLICABLE, "WATCH", err, r.metricsRecorder) + return watcher, err } diff --git a/service/k8s/secret.go b/service/k8s/secret.go index d7141b9a1..0edd23dea 100644 --- a/service/k8s/secret.go +++ b/service/k8s/secret.go @@ -4,6 +4,7 @@ import ( "context" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -16,22 +17,25 @@ type Secret interface { // SecretService is the secret service implementation using API calls to kubernetes. type SecretService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } -func NewSecretService(kubeClient kubernetes.Interface, logger log.Logger) *SecretService { +func NewSecretService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *SecretService { logger = logger.With("service", "k8s.secret") return &SecretService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (s *SecretService) GetSecret(namespace, name string) (*corev1.Secret, error) { secret, err := s.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Secret", name, "GET", err, s.metricsRecorder) if err != nil { return nil, err } diff --git a/service/k8s/secret_test.go b/service/k8s/secret_test.go index 097b1e4ce..f39dd0e71 100644 --- a/service/k8s/secret_test.go +++ b/service/k8s/secret_test.go @@ -5,6 +5,7 
@@ import ( "testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" @@ -46,7 +47,7 @@ func TestSecretServiceGet(t *testing.T) { assert.NoError(err) // test getting the secret - service := NewSecretService(mcli, log.Dummy) + service := NewSecretService(mcli, log.Dummy, metrics.Dummy) ss, err := service.GetSecret(secret.ObjectMeta.Namespace, secret.ObjectMeta.Name) assert.NotNil(ss) assert.NoError(err) diff --git a/service/k8s/service.go b/service/k8s/service.go index 1e956c2c1..24341b080 100644 --- a/service/k8s/service.go +++ b/service/k8s/service.go @@ -9,6 +9,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // Service the ServiceAccount service that knows how to interact with k8s to manage them @@ -24,21 +25,25 @@ type Service interface { // ServiceService is the service service implementation using API calls to kubernetes. type ServiceService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewServiceService returns a new Service KubeService. 
-func NewServiceService(kubeClient kubernetes.Interface, logger log.Logger) *ServiceService { +func NewServiceService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *ServiceService { logger = logger.With("service", "k8s.service") return &ServiceService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } func (s *ServiceService) GetService(namespace string, name string) (*corev1.Service, error) { service, err := s.kubeClient.CoreV1().Services(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "Service", name, "GET", err, s.metricsRecorder) if err != nil { + log.Errorf("Error while getting service %v in %v namespace : %v", name, namespace, err) return nil, err } @@ -47,6 +52,7 @@ func (s *ServiceService) GetService(namespace string, name string) (*corev1.Serv func (s *ServiceService) CreateService(namespace string, service *corev1.Service) error { _, err := s.kubeClient.CoreV1().Services(namespace).Create(context.TODO(), service, metav1.CreateOptions{}) + recordMetrics(namespace, "Service", service.GetName(), "CREATE", err, s.metricsRecorder) if err != nil { return err } @@ -55,6 +61,7 @@ func (s *ServiceService) CreateService(namespace string, service *corev1.Service } func (s *ServiceService) CreateIfNotExistsService(namespace string, service *corev1.Service) error { + log.Debugf("trying to get %v service in %v namespace... ", service.GetName(), namespace) if _, err := s.GetService(namespace, service.Name); err != nil { // If no resource we need to create. 
if errors.IsNotFound(err) { @@ -67,14 +74,17 @@ func (s *ServiceService) CreateIfNotExistsService(namespace string, service *cor func (s *ServiceService) UpdateService(namespace string, service *corev1.Service) error { _, err := s.kubeClient.CoreV1().Services(namespace).Update(context.TODO(), service, metav1.UpdateOptions{}) + recordMetrics(namespace, "Service", service.GetName(), "UPDATE", err, s.metricsRecorder) if err != nil { return err } s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Infof("service updated") + return nil } func (s *ServiceService) CreateOrUpdateService(namespace string, service *corev1.Service) error { storedService, err := s.GetService(namespace, service.Name) if err != nil { + log.Errorf("Error while getting service %v in %v namespace : %v", service.GetName(), namespace, err) // If no resource we need to create. if errors.IsNotFound(err) { @@ -93,9 +103,13 @@ func (s *ServiceService) CreateOrUpdateService(namespace string, service *corev1 func (s *ServiceService) DeleteService(namespace string, name string) error { propagation := metav1.DeletePropagationForeground - return s.kubeClient.CoreV1().Services(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + err := s.kubeClient.CoreV1().Services(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + recordMetrics(namespace, "Service", name, "DELETE", err, s.metricsRecorder) + return err } func (s *ServiceService) ListServices(namespace string) (*corev1.ServiceList, error) { - return s.kubeClient.CoreV1().Services(namespace).List(context.TODO(), metav1.ListOptions{}) + serviceList, err := s.kubeClient.CoreV1().Services(namespace).List(context.TODO(), metav1.ListOptions{}) + recordMetrics(namespace, "Service", metrics.NOT_APPLICABLE, "LIST", err, s.metricsRecorder) + return serviceList, err } diff --git a/service/k8s/service_test.go b/service/k8s/service_test.go index 
5e18e3555..7b3678e7e 100644 --- a/service/k8s/service_test.go +++ b/service/k8s/service_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -103,7 +104,7 @@ func TestServiceServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewServiceService(mcli, log.Dummy) + service := k8s.NewServiceService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdateService(testns, test.service) if test.expErr { diff --git a/service/k8s/statefulset.go b/service/k8s/statefulset.go index 0bbbee003..5adfe0e74 100644 --- a/service/k8s/statefulset.go +++ b/service/k8s/statefulset.go @@ -12,6 +12,7 @@ import ( "k8s.io/client-go/kubernetes" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" ) // StatefulSet the StatefulSet service that knows how to interact with k8s to manage them @@ -27,22 +28,25 @@ type StatefulSet interface { // StatefulSetService is the service account service implementation using API calls to kubernetes. type StatefulSetService struct { - kubeClient kubernetes.Interface - logger log.Logger + kubeClient kubernetes.Interface + logger log.Logger + metricsRecorder metrics.Recorder } // NewStatefulSetService returns a new StatefulSet KubeService. 
-func NewStatefulSetService(kubeClient kubernetes.Interface, logger log.Logger) *StatefulSetService { +func NewStatefulSetService(kubeClient kubernetes.Interface, logger log.Logger, metricsRecorder metrics.Recorder) *StatefulSetService { logger = logger.With("service", "k8s.statefulSet") return &StatefulSetService{ - kubeClient: kubeClient, - logger: logger, + kubeClient: kubeClient, + logger: logger, + metricsRecorder: metricsRecorder, } } // GetStatefulSet will retrieve the requested statefulset based on namespace and name func (s *StatefulSetService) GetStatefulSet(namespace, name string) (*appsv1.StatefulSet, error) { statefulSet, err := s.kubeClient.AppsV1().StatefulSets(namespace).Get(context.TODO(), name, metav1.GetOptions{}) + recordMetrics(namespace, "StatefulSet", name, "GET", err, s.metricsRecorder) if err != nil { return nil, err } @@ -66,6 +70,7 @@ func (s *StatefulSetService) GetStatefulSetPods(namespace, name string) (*corev1 // CreateStatefulSet will create the given statefulset func (s *StatefulSetService) CreateStatefulSet(namespace string, statefulSet *appsv1.StatefulSet) error { _, err := s.kubeClient.AppsV1().StatefulSets(namespace).Create(context.TODO(), statefulSet, metav1.CreateOptions{}) + recordMetrics(namespace, "StatefulSet", statefulSet.GetName(), "CREATE", err, s.metricsRecorder) if err != nil { return err } @@ -76,6 +81,7 @@ func (s *StatefulSetService) CreateStatefulSet(namespace string, statefulSet *ap // UpdateStatefulSet will update the given statefulset func (s *StatefulSetService) UpdateStatefulSet(namespace string, statefulSet *appsv1.StatefulSet) error { _, err := s.kubeClient.AppsV1().StatefulSets(namespace).Update(context.TODO(), statefulSet, metav1.UpdateOptions{}) + recordMetrics(namespace, "StatefulSet", statefulSet.GetName(), "UPDATE", err, s.metricsRecorder) if err != nil { return err } @@ -105,10 +111,14 @@ func (s *StatefulSetService) CreateOrUpdateStatefulSet(namespace string, statefu // DeleteStatefulSet will delete 
the statefulset func (s *StatefulSetService) DeleteStatefulSet(namespace, name string) error { propagation := metav1.DeletePropagationForeground - return s.kubeClient.AppsV1().StatefulSets(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + err := s.kubeClient.AppsV1().StatefulSets(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{PropagationPolicy: &propagation}) + recordMetrics(namespace, "StatefulSet", name, "DELETE", err, s.metricsRecorder) + return err } // ListStatefulSets will retrieve a list of statefulset in the given namespace func (s *StatefulSetService) ListStatefulSets(namespace string) (*appsv1.StatefulSetList, error) { - return s.kubeClient.AppsV1().StatefulSets(namespace).List(context.TODO(), metav1.ListOptions{}) + stsList, err := s.kubeClient.AppsV1().StatefulSets(namespace).List(context.TODO(), metav1.ListOptions{}) + recordMetrics(namespace, "StatefulSet", metrics.NOT_APPLICABLE, "LIST", err, s.metricsRecorder) + return stsList, err } diff --git a/service/k8s/statefulset_test.go b/service/k8s/statefulset_test.go index 305cfeff7..57b653710 100644 --- a/service/k8s/statefulset_test.go +++ b/service/k8s/statefulset_test.go @@ -14,6 +14,7 @@ import ( kubetesting "k8s.io/client-go/testing" "github.com/spotahome/redis-operator/log" + "github.com/spotahome/redis-operator/metrics" "github.com/spotahome/redis-operator/service/k8s" ) @@ -103,7 +104,7 @@ func TestStatefulSetServiceGetCreateOrUpdate(t *testing.T) { return true, nil, test.errorOnCreation }) - service := k8s.NewStatefulSetService(mcli, log.Dummy) + service := k8s.NewStatefulSetService(mcli, log.Dummy, metrics.Dummy) err := service.CreateOrUpdateStatefulSet(testns, test.statefulSet) if test.expErr { diff --git a/service/k8s/util.go b/service/k8s/util.go index 44cb6cb9e..9c2cb1668 100644 --- a/service/k8s/util.go +++ b/service/k8s/util.go @@ -4,6 +4,8 @@ import ( "fmt" redisfailoverv1 
"github.com/spotahome/redis-operator/api/redisfailover/v1" + "github.com/spotahome/redis-operator/metrics" + "k8s.io/apimachinery/pkg/api/errors" ) // GetRedisPassword retreives password from kubernetes secret or, if @@ -26,3 +28,18 @@ func GetRedisPassword(s Services, rf *redisfailoverv1.RedisFailover) (string, er return "", fmt.Errorf("secret \"%s\" does not have a password field", rf.Spec.Auth.SecretPath) } + +func recordMetrics(namespace string, kind string, object string, operation string, err error, metricsRecorder metrics.Recorder) { + if nil == err { + metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.SUCCESS, metrics.NOT_APPLICABLE) + } else if errors.IsForbidden(err) { + metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.FAIL, metrics.K8S_FORBIDDEN_ERR) + } else if errors.IsUnauthorized(err) { + metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.FAIL, metrics.K8S_UNAUTH) + } else if errors.IsNotFound(err) { + metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.FAIL, metrics.K8S_NOT_FOUND) + } else { + metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.FAIL, metrics.K8S_MISC) + } + +} diff --git a/service/redis/client.go b/service/redis/client.go index dc7e8fa67..52515d978 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -75,20 +75,24 @@ func (c *client) GetNumberSentinelsInMemory(ip string) (int32, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "sentinel").Result() if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_SENTINELS_IN_MEM, metrics.FAIL, getRedisError(err)) return 0, err } if err2 := isSentinelReady(info); err2 != nil { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_SENTINELS_IN_MEM, metrics.FAIL, metrics.SENTINEL_NOT_READY) return 0, err2 } match 
:= sentinelNumberRE.FindStringSubmatch(info) if len(match) == 0 { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_SENTINELS_IN_MEM, metrics.FAIL, metrics.REGEX_NOT_FOUND) return 0, errors.New("seninel regex not found") } nSentinels, err := strconv.Atoi(match[1]) if err != nil { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_SENTINELS_IN_MEM, metrics.FAIL, metrics.MISC) return 0, err } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_SENTINELS_IN_MEM, metrics.SUCCESS, metrics.NOT_APPLICABLE) return int32(nSentinels), nil } @@ -103,20 +107,24 @@ func (c *client) GetNumberSentinelSlavesInMemory(ip string) (int32, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "sentinel").Result() if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_REDIS_SLAVES_IN_MEM, metrics.FAIL, getRedisError(err)) return 0, err } if err2 := isSentinelReady(info); err2 != nil { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_REDIS_SLAVES_IN_MEM, metrics.FAIL, metrics.SENTINEL_NOT_READY) return 0, err2 } match := slaveNumberRE.FindStringSubmatch(info) if len(match) == 0 { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_REDIS_SLAVES_IN_MEM, metrics.FAIL, metrics.REGEX_NOT_FOUND) return 0, errors.New("slaves regex not found") } nSlaves, err := strconv.Atoi(match[1]) if err != nil { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_REDIS_SLAVES_IN_MEM, metrics.FAIL, metrics.MISC) return 0, err } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_NUM_REDIS_SLAVES_IN_MEM, metrics.SUCCESS, metrics.NOT_APPLICABLE) return int32(nSlaves), nil } @@ -140,13 +148,15 @@ func (c *client) ResetSentinel(ip string) error { cmd := rediscli.NewIntCmd(context.TODO(), "SENTINEL", 
"reset", "*") err := rClient.Process(context.TODO(), cmd) if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.RESET_SENTINEL, metrics.FAIL, getRedisError(err)) return err } _, err = cmd.Result() if err != nil { + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.RESET_SENTINEL, metrics.FAIL, getRedisError(err)) return err } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.RESET_SENTINEL, metrics.SUCCESS, metrics.NOT_APPLICABLE) return nil } @@ -162,17 +172,16 @@ func (c *client) GetSlaveOf(ip, port, password string) (string, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "replication").Result() if err != nil { - log.Errorf("error while getting masterIP : %v", err) - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.GET_SLAVE_OF, metrics.FAIL, getRedisError(err)) return "", err } match := redisMasterHostRE.FindStringSubmatch(info) if len(match) == 0 { - log.Errorf("error while getting masterIP : %v", err) - c.metricsRecorder.SetRedisInstance(ip, ip, "master") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.GET_SLAVE_OF, metrics.FAIL, metrics.REGEX_NOT_FOUND) + log.Errorf("error while getting masterIP : No match for for %v while querying redis instance %v for replication info", redisMasterHostREString, ip) return "", nil } - c.metricsRecorder.SetRedisInstance(ip, match[1], "slave") + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.GET_SLAVE_OF, metrics.SUCCESS, metrics.NOT_APPLICABLE) return match[1], nil } @@ -186,9 +195,10 @@ func (c *client) IsMaster(ip, port, password string) (bool, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "replication").Result() if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.IS_MASTER, metrics.FAIL, 
getRedisError(err)) return false, err } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.IS_MASTER, metrics.SUCCESS, metrics.NOT_APPLICABLE) return strings.Contains(info, redisRoleMaster), nil } @@ -210,12 +220,12 @@ func (c *client) MonitorRedisWithPort(ip, monitor, port, quorum, password string cmd = rediscli.NewBoolCmd(context.TODO(), "SENTINEL", "MONITOR", masterName, monitor, port, quorum) err := rClient.Process(context.TODO(), cmd) if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MONITOR_REDIS_WITH_PORT, metrics.FAIL, getRedisError(err)) return err } _, err = cmd.Result() if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MONITOR_REDIS_WITH_PORT, metrics.FAIL, getRedisError(err)) return err } @@ -223,7 +233,7 @@ func (c *client) MonitorRedisWithPort(ip, monitor, port, quorum, password string cmd = rediscli.NewBoolCmd(context.TODO(), "SENTINEL", "SET", masterName, "auth-pass", password) err := rClient.Process(context.TODO(), cmd) if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MONITOR_REDIS_WITH_PORT, metrics.FAIL, getRedisError(err)) return err } _, err = cmd.Result() @@ -231,6 +241,7 @@ func (c *client) MonitorRedisWithPort(ip, monitor, port, quorum, password string return err } } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MONITOR_REDIS_WITH_PORT, metrics.SUCCESS, metrics.NOT_APPLICABLE) return nil } @@ -243,9 +254,10 @@ func (c *client) MakeMaster(ip string, port string, password string) error { rClient := rediscli.NewClient(options) defer rClient.Close() if res := rClient.SlaveOf(context.TODO(), "NO", "ONE"); res.Err() != nil { - c.recordRedisError(res.Err(), ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MAKE_MASTER, metrics.FAIL, getRedisError(res.Err())) return 
res.Err() } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MAKE_MASTER, metrics.SUCCESS, metrics.NOT_APPLICABLE) return nil } @@ -262,9 +274,10 @@ func (c *client) MakeSlaveOfWithPort(ip, masterIP, masterPort, password string) rClient := rediscli.NewClient(options) defer rClient.Close() if res := rClient.SlaveOf(context.TODO(), masterIP, masterPort); res.Err() != nil { - c.recordRedisError(res.Err(), ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MAKE_SLAVE_OF, metrics.FAIL, getRedisError(res.Err())) return res.Err() } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, ip, metrics.MAKE_SLAVE_OF, metrics.SUCCESS, metrics.NOT_APPLICABLE) return nil } @@ -279,16 +292,17 @@ func (c *client) GetSentinelMonitor(ip string) (string, string, error) { cmd := rediscli.NewSliceCmd(context.TODO(), "SENTINEL", "master", masterName) err := rClient.Process(context.TODO(), cmd) if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_SENTINEL_MONITOR, metrics.FAIL, getRedisError(err)) return "", "", err } res, err := cmd.Result() if err != nil { - c.recordRedisError(err, ip) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_SENTINEL_MONITOR, metrics.FAIL, getRedisError(err)) return "", "", err } masterIP := res[3].(string) masterPort := res[5].(string) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, ip, metrics.GET_SENTINEL_MONITOR, metrics.SUCCESS, metrics.NOT_APPLICABLE) return masterIP, masterPort, nil } @@ -337,9 +351,10 @@ func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, func (c *client) applyRedisConfig(parameter string, value string, rClient *rediscli.Client) error { result := rClient.ConfigSet(context.TODO(), parameter, value) if nil != result.Err() { - c.recordRedisError(result.Err(), strings.Split(rClient.Options().Addr, ":")[0]) +
c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, strings.Split(rClient.Options().Addr, ":")[0], metrics.APPLY_REDIS_CONFIG, metrics.FAIL, getRedisError(result.Err())) + return result.Err() } - + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, strings.Split(rClient.Options().Addr, ":")[0], metrics.APPLY_REDIS_CONFIG, metrics.SUCCESS, metrics.NOT_APPLICABLE) return result.Err() } @@ -347,9 +362,10 @@ func (c *client) applySentinelConfig(parameter string, value string, rClient *re cmd := rediscli.NewStatusCmd(context.TODO(), "SENTINEL", "set", masterName, parameter, value) err := rClient.Process(context.TODO(), cmd) if err != nil { - c.recordRedisError(err, strings.Split(rClient.Options().Addr, ":")[0]) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, strings.Split(rClient.Options().Addr, ":")[0], metrics.APPLY_SENTINEL_CONFIG, metrics.FAIL, getRedisError(err)) return err } + c.metricsRecorder.RecordRedisOperation(metrics.KIND_SENTINEL, strings.Split(rClient.Options().Addr, ":")[0], metrics.APPLY_SENTINEL_CONFIG, metrics.SUCCESS, metrics.NOT_APPLICABLE) return cmd.Err() } @@ -371,27 +387,29 @@ func (c *client) SlaveIsReady(ip, port, password string) (bool, error) { defer rClient.Close() info, err := rClient.Info(context.TODO(), "replication").Result() if err != nil { - c.recordRedisError(err, err.Error()) + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, strings.Split(rClient.Options().Addr, ":")[0], metrics.SLAVE_IS_READY, metrics.FAIL, getRedisError(err)) return false, err } ok := !strings.Contains(info, redisSyncing) && !strings.Contains(info, redisMasterSillPending) && strings.Contains(info, redisLinkUp) - + c.metricsRecorder.RecordRedisOperation(metrics.KIND_REDIS, strings.Split(rClient.Options().Addr, ":")[0], metrics.SLAVE_IS_READY, metrics.SUCCESS, metrics.NOT_APPLICABLE) return ok, nil } -func (c *client) recordRedisError(err error, ip string) { +func getRedisError(err error) string { if 
strings.Contains(err.Error(), "NOAUTH") { - c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.NOAUTH, ip) + return metrics.NOAUTH } else if strings.Contains(err.Error(), "WRONGPASS") { - c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.WRONG_PASSWORD_USED, ip) + return metrics.WRONG_PASSWORD_USED } else if strings.Contains(err.Error(), "NOPERM") { - c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.NOPERM, ip) + return metrics.NOPERM } else if strings.Contains(err.Error(), "i/o timeout") { - c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.IO_TIMEOUT, ip) + return metrics.IO_TIMEOUT } else if strings.Contains(err.Error(), "connection refused") { - c.metricsRecorder.IncrRedisUnhealthyCount(metrics.NOT_APPLICABLE, metrics.NOT_APPLICABLE, metrics.CONNECTION_REFUSED, ip) + return metrics.CONNECTION_REFUSED + } else { + return "MISC" } } From 000a4399d2c855fd4dd1264d4a6c856437abe601 Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Tue, 27 Sep 2022 16:17:33 +0530 Subject: [PATCH 06/24] revert unwanted changes --- example/operator/all-redis-operator-resources.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/operator/all-redis-operator-resources.yaml b/example/operator/all-redis-operator-resources.yaml index ec4027d16..b30afee3d 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -18,8 +18,8 @@ spec: spec: serviceAccountName: redisoperator containers: - - image: redis-operator:latest - imagePullPolicy: Never + - image: quay.io/spotahome/redis-operator:v1.1.0 + imagePullPolicy: IfNotPresent name: app securityContext: readOnlyRootFilesystem: true From 244dd2e9e4e4f45dd1554c749db6b279c1755525 Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Fri, 30 Sep 2022 13:55:36 +0530 
Subject: [PATCH 07/24] Add metrics dashboard example --- .../redisfailover-health.json | 2276 +++++++++++++++++ .../grafana-dashboard/summary-dashboard.json | 200 ++ service/k8s/service.go | 3 +- 3 files changed, 2478 insertions(+), 1 deletion(-) create mode 100644 example/grafana-dashboard/redisfailover-health.json create mode 100644 example/grafana-dashboard/summary-dashboard.json diff --git a/example/grafana-dashboard/redisfailover-health.json b/example/grafana-dashboard/redisfailover-health.json new file mode 100644 index 000000000..6006a487b --- /dev/null +++ b/example/grafana-dashboard/redisfailover-health.json @@ -0,0 +1,2276 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 6030, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 12, + "panels": [], + "title": "Health Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Operator", + "mappings": [ + { + "options": { + "0": { + "index": 1, + "text": "ERROR" + }, + "1": { + "index": 0, + "text": "OK" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "group by (namespace, name) (redis_operator_controller_cluster_ok{name=\"$tenant\"})", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "$tenant - operator functioning health", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Sentinel", + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" + }, + { + "options": { + "from": 0.1, + "result": { + "color": "orange", + "index": 1, + "text": "Unhealthy" + }, + "to": 999999999 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "index": 2, + "text": "Healthy" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 10, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_sentinel_unhealthy{resource=\"$tenant\"}[2m]) * 30)", + "legendFormat": "Sentinel Status", + "range": true, + "refId": "A" + } + ], + "title": "$tenant sentinel health", + "transparent": true, + "type": "stat" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "redis", + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" + }, + { + "options": { + "from": 0.1, + "result": { + "color": "orange", + "index": 1, + "text": "Unhealthy" + }, + "to": 999999999 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 2, + "text": "Healthy" + } + }, + "type": "special" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 29, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_redis_unhealthy{resource=\"$tenant\"}[2m]) * 30)", + "legendFormat": "Sentinel Status", + "range": true, + "refId": "A" + } + ], + "title": "$tenant redis health", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 35, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total redis pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 36, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_info{created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]+\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total sentinel pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + 
"h": 6, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 34, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total redis API calls made in last minute (aggregated)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 33, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "expr": "sum (rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30)", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total K8s API calls made in last minute (aggregated)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 14, + "panels": [], + "title": 
"Operator - Functioning Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "from": 0.1, + "result": { + "color": "red", + "index": 0 + }, + "to": 99999999 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "color": "blue", + "index": 1, + "text": "0" + } + }, + "type": "special" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, resource, indicator) (rate(redis_operator_controller_sentinel_unhealthy{resource=\"$tenant\"}[2m]) * 30)", + "legendFormat": "{{namespace}} {{resource}} {{indicator}}", + "range": true, + "refId": "A" + } + ], + "title": "Sentinel Unhealthy", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "from": 0.001, + "result": { + "color": "red", + "index": 0 + }, + "to": 999999 + }, + "type": "range" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, resource, indicator) (rate(redis_operator_controller_redis_unhealthy{resource=\"$tenant\"}[2m]) * 30)", + "legendFormat": "{{namespace}} {{resource}} {{indicator}}", + "range": true, + "refId": "A" + } + ], + "title": "Redis Unhealthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, kind, object, operation) (rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\",status=\"FAIL\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "k8s ops - failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\",status=\"FAIL\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", + "range": true, + "refId": "A" + } + ], + "title": "redis ops - failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30", + "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "k8s ops - all", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", + "range": true, + "refId": "A" + } + ], + "title": "redis ops - all", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 31, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 11, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "expr": "(rate(go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "rate of objects allocated [heap]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} - go_memstats_frees_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "number of live objects [heap]", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_sys_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Total Used Memory", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", 
+ "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_heap_alloc_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "heap memory in use", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + 
], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(rate(go_memstats_alloc_bytes_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "stack memory allocation rate", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_stack_inuse_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + 
} + ], + "title": "stack memory", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_goroutines{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "number of goroutines", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" 
+ }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_gc_duration_seconds{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\", quantile=\"1\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Duration [max]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "redisoperator-7c8f6975b4-px5tc" + 
}, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (pod, container) ((container_cpu_cfs_throttled_periods_total{container=\"app\", pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} / container_cpu_cfs_periods_total{container=\"app\", pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} ) * 100)", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU throttled [%]", + "type": "timeseries" + } + ], + "title": "Operator system metrics", + "type": "row" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "redis-failovertest", + "value": "redis-failovertest" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(redis_operator_controller_cluster_ok, name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "tenant", + "options": [], + "query": { + "query": "label_values(redis_operator_controller_cluster_ok, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_name)", + "hide": 2, + "includeAll": true, + "multi": 
false, + "name": "kubernetesResource", + "options": [], + "query": { + "query": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "failovertest", + "value": "failovertest" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_namespace)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "kubernetesnamespace", + "options": [], + "query": { + "query": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "redis_pod_ip", + "options": [], + "query": { + "query": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "sentinel_pod_ip", + "options": [], + 
"query": { + "query": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "2022-09-30T06:56:37.303Z", + "to": "2022-09-30T07:56:37.303Z" + }, + "timepicker": {}, + "timezone": "", + "title": "[WIP] redis operator", + "uid": "redis-failover", + "version": 6, + "weekStart": "" + } \ No newline at end of file diff --git a/example/grafana-dashboard/summary-dashboard.json b/example/grafana-dashboard/summary-dashboard.json new file mode 100644 index 000000000..d4714455d --- /dev/null +++ b/example/grafana-dashboard/summary-dashboard.json @@ -0,0 +1,200 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [ + { + "targetBlank": true, + "title": "data link", + "url": "d/redis-failover/wip-redis-operator?orgId=98&refresh=30s&${tenant:queryparam}&var-kubernetesResource=All&var-redis_pod_ip=All&var-sentinel_pod_ip=All&from=${__from}&to=${__to}" + } + ], + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 1, + "text": "Healthy" + } + }, + "type": "special" + }, + { + "options": { + "from": 0.01, + "result": { + "color": 
"orange", + "index": 2, + "text": "Unhealthy" + }, + "to": 99999 + }, + "type": "range" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ + { + "targetBlank": true, + "title": "details", + "url": "d/redis-failover/wip-redis-operator?orgId=98&refresh=30s&${tenant:queryparam}&var-kubernetesResource=All&var-redis_pod_ip=All&var-sentinel_pod_ip=All&from=${__from}&to=${__to}" + } + ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.5", + "repeat": "tenant", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_sentinel_unhealthy{resource=~\"$tenant\"}[2m]) * 30) ", + "legendFormat": "Sentinels Status", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_redis_unhealthy{resource=~\"$tenant\"}[2m]) * 30)", + "hide": false, + "legendFormat": "Redis Status", + "range": true, + "refId": "B" + } + ], + "title": "tenant health - $tenant", + "transparent": true, + "type": "stat" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "FoDDSbR7k" + }, + "definition": "label_values(redis_operator_controller_cluster_ok, name)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "tenant", + "options": [], + "query": { + "query": 
"label_values(redis_operator_controller_cluster_ok, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "[WIP] redis operator - Summary", + "weekStart": "" +} \ No newline at end of file diff --git a/service/k8s/service.go b/service/k8s/service.go index 24341b080..9e86f00b0 100644 --- a/service/k8s/service.go +++ b/service/k8s/service.go @@ -42,9 +42,10 @@ func NewServiceService(kubeClient kubernetes.Interface, logger log.Logger, metri func (s *ServiceService) GetService(namespace string, name string) (*corev1.Service, error) { service, err := s.kubeClient.CoreV1().Services(namespace).Get(context.TODO(), name, metav1.GetOptions{}) - log.Errorf("Error while getting service %v in %v namespace : %v", name, namespace, err) + recordMetrics(namespace, "Service", name, "GET", err, s.metricsRecorder) if err != nil { + log.Errorf("Error while getting service %v in %v namespace : %v", name, namespace, err) return nil, err } return service, err From a719a1c27f7e4b3d2920021b66b50562ca54e2c1 Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Sat, 1 Oct 2022 11:30:27 +0530 Subject: [PATCH 08/24] revert unwanted changes --- api/redisfailover/v1/validate.go | 1 - docker/development/Dockerfile | 2 +- operator/redisfailover/checker.go | 2 -- operator/redisfailover/service/check.go | 2 -- service/k8s/service.go | 2 -- service/k8s/util.go | 1 - 6 files changed, 1 insertion(+), 9 deletions(-) diff --git a/api/redisfailover/v1/validate.go b/api/redisfailover/v1/validate.go index f3efedbff..ec5a7e66e 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -12,7 +12,6 @@ const ( // Validate set the values by default if not defined and checks if the values given are valid func (r *RedisFailover) Validate() error { - if len(r.Name) > maxNameLength { return 
fmt.Errorf("name length can't be higher than %d", maxNameLength) } diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile index 5a0959f62..0e247dbcb 100644 --- a/docker/development/Dockerfile +++ b/docker/development/Dockerfile @@ -18,7 +18,7 @@ RUN wget http://github.com/kubernetes/code-generator/archive/kubernetes-${CODEGE # Mock creator ARG MOCKERY_VERSION="2.9.4" -RUN wget -c "https://github.com/vektra/mockery/releases/download/v2.9.4/mockery_2.9.4_Linux_arm64.tar.gz" -O - | tar -xz -C /go/bin/ +RUN wget -c https://github.com/vektra/mockery/releases/download/v${MOCKERY_VERSION}/mockery_${MOCKERY_VERSION}_$(uname -o)_$(uname -m).tar.gz -O - | tar -xz -C /go/bin/ # Create user ARG uid=1000 diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index ee0c86083..94e75751b 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -101,13 +101,11 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // All sentinels points to the same redis master // Sentinel has not death nodes // Sentinel knows the correct slave number - if err := r.rfChecker.CheckRedisNumber(rf); err != nil { r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) return nil } - if err := r.rfChecker.CheckSentinelNumber(rf); err != nil { r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) diff --git a/operator/redisfailover/service/check.go b/operator/redisfailover/service/check.go index b1add5f9d..ad86e858a 100644 --- a/operator/redisfailover/service/check.go +++ b/operator/redisfailover/service/check.go @@ -123,7 +123,6 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master 
string, rf *redis } slave, err := r.redisClient.GetSlaveOf(rp.Status.PodIP, rport, password) - // set redis instance metrics if err != nil { r.logger.Errorf("Get slave of master failed, maybe this node is not ready, pod ip: %s", rp.Status.PodIP) return err @@ -131,7 +130,6 @@ func (r *RedisFailoverChecker) CheckAllSlavesFromMaster(master string, rf *redis if slave != "" && slave != master { return fmt.Errorf("slave %s don't have the master %s, has %s", rp.Status.PodIP, master, slave) } - } return nil } diff --git a/service/k8s/service.go b/service/k8s/service.go index 9e86f00b0..867b4cebe 100644 --- a/service/k8s/service.go +++ b/service/k8s/service.go @@ -42,7 +42,6 @@ func NewServiceService(kubeClient kubernetes.Interface, logger log.Logger, metri func (s *ServiceService) GetService(namespace string, name string) (*corev1.Service, error) { service, err := s.kubeClient.CoreV1().Services(namespace).Get(context.TODO(), name, metav1.GetOptions{}) - recordMetrics(namespace, "Service", name, "GET", err, s.metricsRecorder) if err != nil { log.Errorf("Error while getting service %v in %v namespace : %v", name, namespace, err) @@ -80,7 +79,6 @@ func (s *ServiceService) UpdateService(namespace string, service *corev1.Service return err } s.logger.WithField("namespace", namespace).WithField("serviceName", service.Name).Infof("service updated") - return nil } func (s *ServiceService) CreateOrUpdateService(namespace string, service *corev1.Service) error { diff --git a/service/k8s/util.go b/service/k8s/util.go index 9c2cb1668..2cd9bbd73 100644 --- a/service/k8s/util.go +++ b/service/k8s/util.go @@ -41,5 +41,4 @@ func recordMetrics(namespace string, kind string, object string, operation strin } else { metricsRecorder.RecordK8sOperation(namespace, kind, object, operation, metrics.FAIL, metrics.K8S_MISC) } - } From c6fb389e3f21d175a5367d06cadec1ade359486b Mon Sep 17 00:00:00 2001 From: Raghu Nandan B S Date: Sat, 1 Oct 2022 11:31:07 +0530 Subject: [PATCH 09/24] revert unwanted 
changes --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index c91bc8c4b..21ca105b7 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,7 @@ image: deps-development -t $(SERVICE_NAME) \ -t $(REPOSITORY):latest \ -t $(REPOSITORY):$(COMMIT) \ + -t $(REPOSITORY):$(BRANCH) \ -f $(APP_DIR)/Dockerfile \ . From 6928c084775ef582ac545259aa8ffc66c7736beb Mon Sep 17 00:00:00 2001 From: "guozhi.li" Date: Thu, 13 Oct 2022 10:19:50 +0800 Subject: [PATCH 10/24] support custom cmd. value is empty --- service/redis/client.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/service/redis/client.go b/service/redis/client.go index e3d070b2f..63d722a17 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -306,7 +306,8 @@ func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, return err } // If the configuration is an empty line , it will result in an incorrect configSet, which will not run properly down the line. - if strings.TrimSpace(param) == "" || strings.TrimSpace(value) == "" { + // `config set save ""` should support + if strings.TrimSpace(param) == "" { continue } if err := c.applyRedisConfig(param, value, rClient); err != nil { From cba38493a3e17cc280aed7e20bbee60e4d34ade2 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 13 Oct 2022 21:25:44 +0530 Subject: [PATCH 11/24] revamp checker metrics - add status label to checker - metrics, and record each operation as success / fail --- Makefile | 1 - api/redisfailover/v1/validate.go | 19 +++- cmd/redisoperator/main.go | 1 + docker/development/Dockerfile | 2 +- .../all-redis-operator-resources.yaml | 65 ++++++++++- example/redisfailover/basic.yaml | 2 +- metrics/dummy.go | 18 ++-- metrics/metrics.go | 102 ++++++------------ operator/redisfailover/checker.go | 85 +++++++++++---- operator/redisfailover/handler.go | 11 ++ operator/redisfailover/service/client.go | 4 +- service.yaml | 20 ++++ 12 files changed, 214 insertions(+), 116 
deletions(-) create mode 100644 service.yaml diff --git a/Makefile b/Makefile index 21ca105b7..c91bc8c4b 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,6 @@ image: deps-development -t $(SERVICE_NAME) \ -t $(REPOSITORY):latest \ -t $(REPOSITORY):$(COMMIT) \ - -t $(REPOSITORY):$(BRANCH) \ -f $(APP_DIR)/Dockerfile \ . diff --git a/api/redisfailover/v1/validate.go b/api/redisfailover/v1/validate.go index ec5a7e66e..5b4bcb4f8 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -16,7 +16,6 @@ func (r *RedisFailover) Validate() error { return fmt.Errorf("name length can't be higher than %d", maxNameLength) } - initialRedisCustomConfig := defaultRedisCustomConfig if r.Bootstrapping() { if r.Spec.BootstrapNode.Host == "" { return errors.New("BootstrapNode must include a host when provided") @@ -25,11 +24,11 @@ func (r *RedisFailover) Validate() error { if r.Spec.BootstrapNode.Port == "" { r.Spec.BootstrapNode.Port = strconv.Itoa(defaultRedisPort) } - initialRedisCustomConfig = bootstrappingRedisCustomConfig + r.Spec.Redis.CustomConfig = deduplicateStr(append(bootstrappingRedisCustomConfig, r.Spec.Redis.CustomConfig...)) + } else { + r.Spec.Redis.CustomConfig = deduplicateStr(append(defaultRedisCustomConfig, r.Spec.Redis.CustomConfig...)) } - r.Spec.Redis.CustomConfig = append(initialRedisCustomConfig, r.Spec.Redis.CustomConfig...) 
- if r.Spec.Redis.Image == "" { r.Spec.Redis.Image = defaultImage } @@ -64,3 +63,15 @@ func (r *RedisFailover) Validate() error { return nil } + +func deduplicateStr(strSlice []string) []string { + allKeys := make(map[string]bool) + list := []string{} + for _, item := range strSlice { + if _, value := allKeys[item]; !value { + allKeys[item] = true + list = append(list, item) + } + } + return list +} diff --git a/cmd/redisoperator/main.go b/cmd/redisoperator/main.go index c42b73874..1961dac28 100644 --- a/cmd/redisoperator/main.go +++ b/cmd/redisoperator/main.go @@ -5,6 +5,7 @@ import ( "fmt" "io/ioutil" "net/http" + _ "net/http/pprof" "os" "os/signal" "strings" diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile index 0e247dbcb..5a0959f62 100644 --- a/docker/development/Dockerfile +++ b/docker/development/Dockerfile @@ -18,7 +18,7 @@ RUN wget http://github.com/kubernetes/code-generator/archive/kubernetes-${CODEGE # Mock creator ARG MOCKERY_VERSION="2.9.4" -RUN wget -c https://github.com/vektra/mockery/releases/download/v${MOCKERY_VERSION}/mockery_${MOCKERY_VERSION}_$(uname -o)_$(uname -m).tar.gz -O - | tar -xz -C /go/bin/ +RUN wget -c "https://github.com/vektra/mockery/releases/download/v2.9.4/mockery_2.9.4_Linux_arm64.tar.gz" -O - | tar -xz -C /go/bin/ # Create user ARG uid=1000 diff --git a/example/operator/all-redis-operator-resources.yaml b/example/operator/all-redis-operator-resources.yaml index b30afee3d..550f4c234 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -18,9 +18,15 @@ spec: spec: serviceAccountName: redisoperator containers: - - image: quay.io/spotahome/redis-operator:v1.1.0 - imagePullPolicy: IfNotPresent + - image: redis-operator:latest + args: + - --debug + imagePullPolicy: Never name: app + ports: + - name: metrics + containerPort: 9710 + protocol: TCP securityContext: readOnlyRootFilesystem: true runAsNonRoot: true @@ -108,3 +114,58 @@ apiVersion: 
v1 kind: ServiceAccount metadata: name: redisoperator +--- + +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: http + prometheus.io/scrape: "true" + name: redisoperator + labels: + app: redisoperator +spec: + type: ClusterIP + ports: + - name: metrics + port: 9710 + protocol: TCP + targetPort: metrics + selector: + app: redisoperator +--- + +#apiVersion: monitoring.coreos.com/v1 +#kind: ServiceMonitor +#metadata: +# name: redis-operator-metrics +# labels: +# app: redisoperator +# release: prometheus +#spec: +# selector: +# matchLabels: +# app: redisoperator +# endpoints: +# - port: metrics +# namespaceSelector: +# matchNames: +# - default +#--- +# +# +#apiVersion: monitoring.coreos.com/v1 +#kind: PodMonitor +#metadata: +# name: redisoperator +# labels: +# app: redisoperator +# release: prometheus +#spec: +# selector: +# matchLabels: +# app: redisoperator +# podMetricsEndpoints: +# - port: metrics diff --git a/example/redisfailover/basic.yaml b/example/redisfailover/basic.yaml index c3ebb1f3a..d8a2947be 100644 --- a/example/redisfailover/basic.yaml +++ b/example/redisfailover/basic.yaml @@ -1,7 +1,7 @@ apiVersion: databases.spotahome.com/v1 kind: RedisFailover metadata: - name: redisfailover + name: redisfailover-metrics-test spec: sentinel: replicas: 3 diff --git a/metrics/dummy.go b/metrics/dummy.go index a5138daf3..8079de605 100644 --- a/metrics/dummy.go +++ b/metrics/dummy.go @@ -14,20 +14,18 @@ type dummy struct { koopercontroller.MetricsRecorder } -func (d *dummy) SetClusterOK(namespace string, name string) {} -func (d *dummy) SetClusterError(namespace string, name string) {} -func (d *dummy) DeleteCluster(namespace string, name string) {} -func (d *dummy) IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) { -} -func (d *dummy) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) 
{ -} +func (d *dummy) SetClusterOK(namespace string, name string) {} +func (d *dummy) SetClusterError(namespace string, name string) {} +func (d *dummy) DeleteCluster(namespace string, name string) {} func (d *dummy) SetRedisInstance(IP string, masterIP string, role string) {} func (d *dummy) ResetRedisInstance() {} -func (d *dummy) IncrRedisUnhealthyCount(namespace string, resource string, indicator string, instance string) { +func (d *dummy) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) { +} +func (d *dummy) RecordRedisCheck(namespace string, resource string, indicator string, instance string, status string) { } -func (d *dummy) IncrSentinelUnhealthyCount(namespace string, resource string, indicator string, instance string) { +func (d *dummy) RecordSentinelCheck(namespace string, resource string, indicator string, instance string, status string) { } func (d dummy) RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) { } -func (d dummy) RecordRedisOperation(kind /*redis/sentinel? 
*/ string, IP string, operation string, status string, err string) { +func (d dummy) RecordRedisOperation(kind string, IP string, operation string, status string, err string) { } diff --git a/metrics/metrics.go b/metrics/metrics.go index b8c0bf2e0..07bcb4ebf 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -14,6 +14,8 @@ const ( const ( SUCCESS = "SUCCESS" FAIL = "FAIL" + STATUS_HEALTHY = "HEALTHY" + STATUS_UNHEALTHY = "UNHEALTHY" NOT_APPLICABLE = "NA" UNHEALTHY = 1.0 HEALTHY = 0.0 @@ -64,67 +66,31 @@ type Recorder interface { SetClusterError(namespace string, name string) DeleteCluster(namespace string, name string) - // Indicate if `ensure` operation succeeded - IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) - // Indicate if `ensure` operation failed - IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) - // Indicate redis instances being monitored SetRedisInstance(IP string, masterIP string, role string) - ResetRedisInstance() + RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) - IncrRedisUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) - IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) + RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) + RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string) RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) - - RecordRedisOperation(kind /*redis/sentinel? 
*/ string, IP string, operation string, status string, err string) + RecordRedisOperation(kind string, IP string, operation string, status string, err string) } // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. type recorder struct { // Metrics fields. clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster - ensureResourceSuccess *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. + ensureResource *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. redisInstance *prometheus.GaugeVec // indicates known redis instances, with IPs and master/slave status - redisUnhealthy *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) - sentinelUnhealthy *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) + redisCheck *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) + sentinelCheck *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s redisOperations *prometheus.CounterVec // number of operations performed on redis/sentinel instances koopercontroller.MetricsRecorder } -type ensureResourceSuccessRecorder struct { - ensureResourceSuccess *prometheus.CounterVec - koopercontroller.MetricsRecorder -} - -type ensureResourceFailureRecorder struct { - ensureResourceSuccess *prometheus.CounterVec - koopercontroller.MetricsRecorder -} - -type redisInstanceRecorder struct { - ensureResourceSuccess *prometheus.GaugeVec - koopercontroller.MetricsRecorder -} - -type k8sServiceOperationsRecorder struct { - operations *prometheus.CounterVec - koopercontroller.MetricsRecorder -} - -type k8sServiceerrorRecorder struct { - 
errors *prometheus.CounterVec - koopercontroller.MetricsRecorder -} - -type redisHealthRecorder struct { - redisUnhealthy *prometheus.CounterVec - koopercontroller.MetricsRecorder -} - // NewPrometheusMetrics returns a new PromMetrics object. func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Create metrics. @@ -135,12 +101,12 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { Help: "Number of failover clusters managed by the operator.", }, []string{"namespace", "name"}) - ensureResourceSuccess := prometheus.NewCounterVec(prometheus.CounterOpts{ + ensureResource := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "ensure_resource_success", + Name: "ensure_resource", Help: "number of successful 'ensure' operations on a resource performed by the controller.", - }, []string{"object_namespace", "object_name", "object_kind", "resource_name"}) + }, []string{"object_namespace", "object_name", "object_kind", "resource_name", "status"}) ensureResourceFailure := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, @@ -156,19 +122,19 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { Help: "redis instances discovered. 
IPs of redis instances, and Master/Slave role as indicators in the labels.", }, []string{"IP", "MasterIP", "role"}) - redisUnhealthy := prometheus.NewCounterVec(prometheus.CounterOpts{ + redisCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "redis_unhealthy", + Name: "redis_check", Help: "indicates any error encountered in managed redis instance(s)", - }, []string{"namespace", "resource", "indicator", "instance"}) + }, []string{"namespace", "resource", "indicator", "instance", "status"}) - sentinelUnhealthy := prometheus.NewCounterVec(prometheus.CounterOpts{ + sentinelCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "sentinel_unhealthy", + Name: "sentinel_check", Help: "indicates any error encountered in managed sentinel instance(s)", - }, []string{"namespace", "resource", "indicator", "instance"}) + }, []string{"namespace", "resource", "indicator", "instance", "status"}) redisOperations := prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -188,11 +154,11 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Create the instance. r := recorder{ clusterOK: clusterOK, - ensureResourceSuccess: ensureResourceSuccess, + ensureResource: ensureResource, ensureResourceFailure: ensureResourceFailure, redisInstance: redisInstance, - redisUnhealthy: redisUnhealthy, - sentinelUnhealthy: sentinelUnhealthy, + redisCheck: redisCheck, + sentinelCheck: sentinelCheck, k8sServiceOperations: k8sServiceOperations, redisOperations: redisOperations, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ @@ -203,11 +169,11 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { // Register metrics. 
reg.MustRegister( r.clusterOK, - r.ensureResourceSuccess, + r.ensureResource, r.ensureResourceFailure, r.redisInstance, - r.redisUnhealthy, - r.sentinelUnhealthy, + r.redisCheck, + r.sentinelCheck, r.k8sServiceOperations, r.redisOperations, ) @@ -230,28 +196,20 @@ func (r recorder) DeleteCluster(namespace string, name string) { r.clusterOK.DeleteLabelValues(namespace, name) } -func (r recorder) IncrEnsureResourceSuccessCount(objectNamespace string, objectName string, objectKind string, resourceName string) { - r.ensureResourceSuccess.WithLabelValues(objectNamespace, objectName, objectKind, resourceName).Add(1) -} - -func (r recorder) IncrEnsureResourceFailureCount(objectNamespace string, objectName string, objectKind string, resourceName string) { - r.ensureResourceSuccess.WithLabelValues(objectNamespace, objectName, objectKind, resourceName).Add(1) -} - func (r recorder) SetRedisInstance(IP string, masterIP string, role string) { r.redisInstance.WithLabelValues(IP, masterIP, role).Set(1) } -func (r recorder) ResetRedisInstance() { - r.redisInstance.Reset() +func (r recorder) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) { + r.ensureResource.WithLabelValues(objectNamespace, objectName, objectKind, resourceName, status).Add(1) } -func (r recorder) IncrRedisUnhealthyCount(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string) { - r.redisUnhealthy.WithLabelValues(namespace, resource, indicator, instance).Add(1) +func (r recorder) RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) { + r.redisCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1) } -func (r recorder) IncrSentinelUnhealthyCount(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string) { - 
r.sentinelUnhealthy.WithLabelValues(namespace, resource, indicator, instance).Add(1) +func (r recorder) RecordSentinelCheck(namespace string, resource string, indicator /* aspect of sentinel that is unhealthy */ string, instance string, status string) { + r.sentinelCheck.WithLabelValues(namespace, resource, indicator, instance, status).Add(1) } func (r recorder) RecordK8sOperation(namespace string, kind string, object string, operation string, status string, err string) { diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 94e75751b..d3884236e 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -101,14 +101,19 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e // All sentinels points to the same redis master // Sentinel has not death nodes // Sentinel knows the correct slave number - if err := r.rfChecker.CheckRedisNumber(rf); err != nil { + + err := r.rfChecker.CheckRedisNumber(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) + if err != nil { r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") - r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) return nil } - if err := r.rfChecker.CheckSentinelNumber(rf); err != nil { + r.mClient.RecordRedisCheck(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, metrics.STATUS_HEALTHY) + + err = r.rfChecker.CheckSentinelNumber(rf) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) + if err != nil { r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") - r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE) return nil } @@ -118,7 
+123,7 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } switch nMasters { case 0: - r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("No masters detected")) redisesIP, err := r.rfChecker.GetRedisesIPs(rf) if err != nil { return err @@ -145,9 +150,10 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e return nil } case 1: + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) break default: - r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected")) return errors.New("More than one master, fix manually") } @@ -155,15 +161,19 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e if err != nil { return err } - if err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf); err2 != nil { - r.mClient.IncrRedisUnhealthyCount(rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE) + + err2 := r.rfChecker.CheckAllSlavesFromMaster(master, rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.SLAVE_WRONG_MASTER, metrics.NOT_APPLICABLE, err) + if err2 != nil { r.logger.Debug("Not all slaves have the same master") if err3 := r.rfHealer.SetMasterOnAll(master, rf); err3 != nil { return err3 } } - if err := r.applyRedisCustomConfig(rf); err != nil { + err = r.applyRedisCustomConfig(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) + if err != nil { return err } @@ -179,8 +189,9 @@ func (r 
*RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e port := getRedisPort(rf.Spec.Redis.Port) for _, sip := range sentinels { - if err := r.rfChecker.CheckSentinelMonitor(sip, master, port); err != nil { - r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip) + err = r.rfChecker.CheckSentinelMonitor(sip, master, port) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, sip, metrics.NOT_APPLICABLE, err) + if err != nil { r.logger.Debug("Sentinel is not monitoring the correct master") if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { return err @@ -192,17 +203,20 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.RedisFailover) error { - if err := r.rfChecker.CheckRedisNumber(rf); err != nil { + err := r.rfChecker.CheckRedisNumber(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) + if err != nil { r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") return nil } - err := r.UpdateRedisesPods(rf) + err = r.UpdateRedisesPods(rf) if err != nil { return err } - - if err := r.applyRedisCustomConfig(rf); err != nil { + err = r.applyRedisCustomConfig(rf) + setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err) + if err != nil { return err } @@ -212,8 +226,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red } if rf.SentinelsAllowed() { - if err := r.rfChecker.CheckSentinelNumber(rf); err != nil { - r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") + err = r.rfChecker.CheckSentinelNumber(rf) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, 
metrics.NOT_APPLICABLE, err) + if err != nil { return nil } @@ -222,7 +237,9 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red return err } for _, sip := range sentinels { - if err := r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port); err != nil { + err = r.rfChecker.CheckSentinelMonitor(sip, bootstrapSettings.Host, bootstrapSettings.Port) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) + if err != nil { r.logger.Debug("Sentinel is not monitoring the correct master") if err := r.rfHealer.NewSentinelMonitorWithPort(sip, bootstrapSettings.Host, bootstrapSettings.Port, rf); err != nil { return err @@ -249,25 +266,30 @@ func (r *RedisFailoverHandler) applyRedisCustomConfig(rf *redisfailoverv1.RedisF func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFailover, sentinels []string) error { for _, sip := range sentinels { - if err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf); err != nil { - r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, metrics.NOT_APPLICABLE) + err := r.rfChecker.CheckSentinelNumberInMemory(sip, rf) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_NUMBER_IN_MEMORY_MISMATCH, sip, err) + if err != nil { r.logger.Debug("Sentinel has more sentinel in memory than spected") if err := r.rfHealer.RestoreSentinel(sip); err != nil { return err } } + } for _, sip := range sentinels { - if err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf); err != nil { + err := r.rfChecker.CheckSentinelSlavesNumberInMemory(sip, rf) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, sip, err) + if err != nil { r.logger.Debug("Sentinel has more slaves in memory than spected") if err := r.rfHealer.RestoreSentinel(sip); err != nil { - 
r.mClient.IncrSentinelUnhealthyCount(rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH, metrics.NOT_APPLICABLE) return err } } } for _, sip := range sentinels { - if err := r.rfHealer.SetSentinelCustomConfig(sip, rf); err != nil { + err := r.rfHealer.SetSentinelCustomConfig(sip, rf) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.APPLY_SENTINEL_CONFIG, sip, err) + if err != nil { return err } } @@ -277,3 +299,20 @@ func (r *RedisFailoverHandler) checkAndHealSentinels(rf *redisfailoverv1.RedisFa func getRedisPort(p int32) string { return strconv.Itoa(int(p)) } + +func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sentinel? */ string, rfNamespace string, rfName string, property string, IP string, err error) { + if mode == "sentinel" { + if err != nil { + metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY) + } else { + metricsClient.RecordSentinelCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY) + } + + } else if mode == "redis" { + if err != nil { + metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_UNHEALTHY) + } else { + metricsClient.RecordRedisCheck(rfNamespace, rfName, property, IP, metrics.STATUS_HEALTHY) + } + } +} diff --git a/operator/redisfailover/handler.go b/operator/redisfailover/handler.go index 6a03afd56..94beed0f5 100644 --- a/operator/redisfailover/handler.go +++ b/operator/redisfailover/handler.go @@ -3,7 +3,10 @@ package redisfailover import ( "context" "fmt" + "os" "regexp" + "runtime/debug" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -82,6 +85,14 @@ func (r *RedisFailoverHandler) Handle(_ context.Context, obj runtime.Object) err } r.mClient.SetClusterOK(rf.Namespace, rf.Name) + + // --------------------- + file, err := os.Create("/tmp/" + fmt.Sprintf("%v", time.Now().Second()) + ".dump") + if err != nil { + log.Error(err) + } + 
debug.WriteHeapDump(file.Fd()) + return nil } diff --git a/operator/redisfailover/service/client.go b/operator/redisfailover/service/client.go index b031b0c92..58c519813 100644 --- a/operator/redisfailover/service/client.go +++ b/operator/redisfailover/service/client.go @@ -174,8 +174,8 @@ func (r *RedisFailoverKubeClient) ensurePodDisruptionBudget(rf *redisfailoverv1. func (r *RedisFailoverKubeClient) setEnsureOperationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) error { if nil != err { - r.metricsClient.IncrEnsureResourceFailureCount(objectNamespace, objectName, objectKind, ownerName) + r.metricsClient.RecordEnsureOperation(objectNamespace, objectName, objectKind, ownerName, metrics.FAIL) } - r.metricsClient.IncrEnsureResourceSuccessCount(objectNamespace, objectName, objectKind, ownerName) + r.metricsClient.RecordEnsureOperation(objectNamespace, objectName, objectKind, ownerName, metrics.SUCCESS) return err } diff --git a/service.yaml b/service.yaml new file mode 100644 index 000000000..0c966dbf2 --- /dev/null +++ b/service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: http + prometheus.io/scrape: "true" + name: redisoperator + labels: + app: redisoperator + namespace: redis-failover +spec: + type: ClusterIP + ports: + - name: metrics + port: 9710 + protocol: TCP + targetPort: metrics + selector: + app: redisoperator \ No newline at end of file From 2e8130cc33756a1c9232cf72f84e41555826ee56 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 13 Oct 2022 21:32:13 +0530 Subject: [PATCH 12/24] revert unwanted changes --- Makefile | 1 + docker/development/Dockerfile | 2 +- .../all-redis-operator-resources.yaml | 74 +------------------ example/redisfailover/basic.yaml | 2 +- service.yaml | 20 ----- 5 files changed, 6 insertions(+), 93 deletions(-) delete mode 100644 service.yaml diff --git a/Makefile b/Makefile index 
c91bc8c4b..21ca105b7 100644 --- a/Makefile +++ b/Makefile @@ -94,6 +94,7 @@ image: deps-development -t $(SERVICE_NAME) \ -t $(REPOSITORY):latest \ -t $(REPOSITORY):$(COMMIT) \ + -t $(REPOSITORY):$(BRANCH) \ -f $(APP_DIR)/Dockerfile \ . diff --git a/docker/development/Dockerfile b/docker/development/Dockerfile index 5a0959f62..0e247dbcb 100644 --- a/docker/development/Dockerfile +++ b/docker/development/Dockerfile @@ -18,7 +18,7 @@ RUN wget http://github.com/kubernetes/code-generator/archive/kubernetes-${CODEGE # Mock creator ARG MOCKERY_VERSION="2.9.4" -RUN wget -c "https://github.com/vektra/mockery/releases/download/v2.9.4/mockery_2.9.4_Linux_arm64.tar.gz" -O - | tar -xz -C /go/bin/ +RUN wget -c https://github.com/vektra/mockery/releases/download/v${MOCKERY_VERSION}/mockery_${MOCKERY_VERSION}_$(uname -o)_$(uname -m).tar.gz -O - | tar -xz -C /go/bin/ # Create user ARG uid=1000 diff --git a/example/operator/all-redis-operator-resources.yaml b/example/operator/all-redis-operator-resources.yaml index 550f4c234..1e184380e 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -18,15 +18,9 @@ spec: spec: serviceAccountName: redisoperator containers: - - image: redis-operator:latest - args: - - --debug - imagePullPolicy: Never + - image: quay.io/spotahome/redis-operator:v1.1.0 + imagePullPolicy: IfNotPresent name: app - ports: - - name: metrics - containerPort: 9710 - protocol: TCP securityContext: readOnlyRootFilesystem: true runAsNonRoot: true @@ -102,70 +96,8 @@ rules: - poddisruptionbudgets verbs: - "*" - - apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - "*" - --- apiVersion: v1 kind: ServiceAccount metadata: - name: redisoperator ---- - -apiVersion: v1 -kind: Service -metadata: - annotations: - prometheus.io/path: /metrics - prometheus.io/port: http - prometheus.io/scrape: "true" - name: redisoperator - labels: - app: redisoperator -spec: - type: ClusterIP - ports: - - name: 
metrics - port: 9710 - protocol: TCP - targetPort: metrics - selector: - app: redisoperator ---- - -#apiVersion: monitoring.coreos.com/v1 -#kind: ServiceMonitor -#metadata: -# name: redis-operator-metrics -# labels: -# app: redisoperator -# release: prometheus -#spec: -# selector: -# matchLabels: -# app: redisoperator -# endpoints: -# - port: metrics -# namespaceSelector: -# matchNames: -# - default -#--- -# -# -#apiVersion: monitoring.coreos.com/v1 -#kind: PodMonitor -#metadata: -# name: redisoperator -# labels: -# app: redisoperator -# release: prometheus -#spec: -# selector: -# matchLabels: -# app: redisoperator -# podMetricsEndpoints: -# - port: metrics + name: redisoperator \ No newline at end of file diff --git a/example/redisfailover/basic.yaml b/example/redisfailover/basic.yaml index d8a2947be..c3ebb1f3a 100644 --- a/example/redisfailover/basic.yaml +++ b/example/redisfailover/basic.yaml @@ -1,7 +1,7 @@ apiVersion: databases.spotahome.com/v1 kind: RedisFailover metadata: - name: redisfailover-metrics-test + name: redisfailover spec: sentinel: replicas: 3 diff --git a/service.yaml b/service.yaml deleted file mode 100644 index 0c966dbf2..000000000 --- a/service.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - annotations: - prometheus.io/path: /metrics - prometheus.io/port: http - prometheus.io/scrape: "true" - name: redisoperator - labels: - app: redisoperator - namespace: redis-failover -spec: - type: ClusterIP - ports: - - name: metrics - port: 9710 - protocol: TCP - targetPort: metrics - selector: - app: redisoperator \ No newline at end of file From ed54086428e3b771766190a03ee0f32689a14923 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 13 Oct 2022 21:37:16 +0530 Subject: [PATCH 13/24] Fix lint checks --- operator/redisfailover/checker.go | 1 - 1 file changed, 1 deletion(-) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index d3884236e..a7f619b82 100644 --- 
a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -151,7 +151,6 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e } case 1: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, nil) - break default: setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.NUMBER_OF_MASTERS, metrics.NOT_APPLICABLE, errors.New("Multiple masters detected")) return errors.New("More than one master, fix manually") From 0939f8feebb9d71debb3af57d9fe7eb707d4ed93 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 13 Oct 2022 21:47:45 +0530 Subject: [PATCH 14/24] Fix intgration tests --- test/integration/redisfailover/creation_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/redisfailover/creation_test.go b/test/integration/redisfailover/creation_test.go index 00eb5c656..709493120 100644 --- a/test/integration/redisfailover/creation_test.go +++ b/test/integration/redisfailover/creation_test.go @@ -81,7 +81,7 @@ func TestRedisFailover(t *testing.T) { require.NoError(err) // Create the redis clients - redisClient := redis.New() + redisClient := redis.New(metrics.Dummy) clients := clients{ k8sClient: k8sClient, @@ -91,7 +91,7 @@ func TestRedisFailover(t *testing.T) { } // Create kubernetes service. 
- k8sservice := k8s.New(k8sClient, customClient, aeClientset, log.Dummy) + k8sservice := k8s.New(k8sClient, customClient, aeClientset, log.Dummy, metrics.Dummy) // Prepare namespace prepErr := clients.prepareNS() From 00182201452f8699a47e1d4554d44ef10cdd00e4 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 13 Oct 2022 22:23:26 +0530 Subject: [PATCH 15/24] force rebuild test From 06db7837c8a782c95e419ae26e4d906b25b3cefc Mon Sep 17 00:00:00 2001 From: corneredrat Date: Fri, 14 Oct 2022 17:33:31 +0530 Subject: [PATCH 16/24] revert unwanted changes --- .../redisfailover-health.json | 4601 +++++++++-------- .../grafana-dashboard/summary-dashboard.json | 200 - .../all-redis-operator-resources.yaml | 64 +- 3 files changed, 2537 insertions(+), 2328 deletions(-) delete mode 100644 example/grafana-dashboard/summary-dashboard.json diff --git a/example/grafana-dashboard/redisfailover-health.json b/example/grafana-dashboard/redisfailover-health.json index 6006a487b..e0d7845bd 100644 --- a/example/grafana-dashboard/redisfailover-health.json +++ b/example/grafana-dashboard/redisfailover-health.json @@ -1,2276 +1,2623 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 6030, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 12, - "panels": [], - "title": "Health Summary", - "type": "row" - }, + "annotations": { + "list": [ { + "builtIn": 1, "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" + "type": "grafana", + "uid": "-- Grafana --" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"thresholds" - }, - "displayName": "Operator", - "mappings": [ - { - "options": { - "0": { - "index": 1, - "text": "ERROR" - }, - "1": { - "index": 0, - "text": "OK" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 28, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 12, + "panels": [], + "title": "Health Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "Operator", + "mappings": [ + { + "options": { + "0": { + "index": 1, + "text": "ERROR" }, - { - "color": "red", - "value": 80 + "1": { + "index": 0, + "text": "OK" } - ] + }, + "type": "value" } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "group by (namespace, name) (redis_operator_controller_cluster_ok{name=\"$tenant\"})", - "legendFormat": "{{label_name}}", - "range": true, - "refId": "A" + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ], - "title": 
"$tenant - operator functioning health", - "transparent": true, - "type": "stat" + }, + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "group by (namespace, name) (redis_operator_controller_cluster_ok{name=\"$tenant\"})", + "legendFormat": "{{label_name}}", + "range": true, + "refId": "A" + } + ], + "title": "$tenant - operator functioning health", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "displayName": "sentinel", + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" }, - "displayName": "Sentinel", - "mappings": [ - { - "options": { - "0": { - "color": "green", - "index": 0, - "text": "Healthy" - } + { + "options": { + "from": 0.1, + "result": { + "color": "orange", + "index": 1, + "text": "Unhealthy" }, - "type": "value" + "to": 999999999 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 2, + "text": "Healthy" + } }, + "type": "special" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ { - "options": { - "from": 0.1, - "result": { - "color": "orange", - "index": 1, - "text": "Unhealthy" - }, - "to": 999999999 - }, - "type": "range" + "color": "green", + "value": null }, { - "options": { - 
"match": "null", - "result": { - "index": 2, - "text": "Healthy" - } - }, - "type": "special" + "color": "red", + "value": 80 } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 1 - }, - "id": 10, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max(rate(redis_operator_controller_sentinel_unhealthy{resource=\"$tenant\"}[2m]) * 30)", - "legendFormat": "Sentinel Status", - "range": true, - "refId": "A" + ] } - ], - "title": "$tenant sentinel health", - "transparent": true, - "type": "stat" + }, + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 44, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_sentinel_check{resource=\"$tenant\", status=\"UNHEALTHY\"}[30s]) * 30)\n", + "legendFormat": "Sentinel Status", + "range": true, + "refId": "A" + } + ], + "title": "$tenant sentinel health", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": 
{ + "color": { + "mode": "thresholds" + }, + "displayName": "redis", + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Healthy" + } + }, + "type": "value" }, - "displayName": "redis", - "mappings": [ - { - "options": { - "0": { - "color": "green", - "index": 0, - "text": "Healthy" - } + { + "options": { + "from": 0.1, + "result": { + "color": "orange", + "index": 1, + "text": "Unhealthy" }, - "type": "value" + "to": 999999999 + }, + "type": "range" + }, + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 2, + "text": "Healthy" + } }, + "type": "special" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ { - "options": { - "from": 0.1, - "result": { - "color": "orange", - "index": 1, - "text": "Unhealthy" - }, - "to": 999999999 - }, - "type": "range" + "color": "green", + "value": null }, { - "options": { - "match": "null", - "result": { - "color": "green", - "index": 2, - "text": "Healthy" - } - }, - "type": "special" + "color": "red", + "value": 80 } - ], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 1 - }, - "id": 29, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "max(rate(redis_operator_controller_redis_unhealthy{resource=\"$tenant\"}[2m]) * 30)", - "legendFormat": "Sentinel Status", - "range": true, - "refId": "A" + ] } - ], - "title": "$tenant redis health", - "transparent": true, - "type": "stat" + }, + "overrides": [] }, - { - 
"datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - }, - { - "color": "blue", - "value": 80 - } - ] - } + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 29, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "max" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 9 - }, - "id": 35, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "Total redis pods", - "type": "stat" + "editorMode": "code", + "expr": "max(rate(redis_operator_controller_redis_check{resource=\"$tenant\", status=\"UNHEALTHY\"}[30s]) * 30)\n", + "legendFormat": "Sentinel Status", + "range": true, + "refId": "A" + } + ], + "title": "$tenant redis health", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": 
[], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - }, - { - "color": "blue", - "value": 80 - } - ] - } + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 9 - }, - "id": 36, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(kube_pod_info{created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]+\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] } - ], - "title": "Total sentinel pods", - "type": "stat" + }, + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - }, - { - "color": "blue", - "value": 80 - } - ] - } + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 35, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 9 - }, - "id": 34, - "options": { 
- "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "Total redis API calls made in last minute (aggregated)", - "type": "stat" + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total redis pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - }, - { - "color": "blue", - "value": 80 - } - ] - } + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 9 - }, - "id": 33, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "expr": "sum 
(rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30)", - "format": "table", - "legendFormat": "__auto", - "range": true, - "refId": "A" + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "blue", + "value": 80 + } + ] } - ], - "title": "Total K8s API calls made in last minute (aggregated)", - "type": "stat" + }, + "overrides": [] }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 15 - }, - "id": 14, - "panels": [], - "title": "Operator - Functioning Details", - "type": "row" + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 9 }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 50, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ + "id": 36, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_info{created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]+\"})", + "format": "table", + 
"instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total sentinel pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "options": { - "from": 0.1, - "result": { - "color": "red", - "index": 0 - }, - "to": 99999999 - }, - "type": "range" + "color": "blue", + "value": null }, { - "options": { - "match": "null", - "result": { - "color": "blue", - "index": 1, - "text": "0" - } - }, - "type": "special" + "color": "blue", + "value": 80 } - ], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 8, - "options": { - "legend": { - "calcs": [ - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Max", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" + ] } }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (namespace, resource, indicator) (rate(redis_operator_controller_sentinel_unhealthy{resource=\"$tenant\"}[2m]) * 30)", - "legendFormat": "{{namespace}} {{resource}} {{indicator}}", - "range": true, - "refId": "A" - } - ], - "title": "Sentinel Unhealthy", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 50, - "gradientMode": "none", - 
"hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "always", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "normal" + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 34, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total redis API calls made in last minute (aggregated)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ { - "options": { - "from": 0.001, - "result": { - "color": "red", - "index": 0 - }, - "to": 999999 - }, - "type": "range" + "color": "blue", + "value": 80 } - ], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 7, - "options": { - "legend": { - "calcs": [ - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Max", - "sortDesc": true - }, - 
"tooltip": { - "mode": "single", - "sort": "none" + ] } }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (namespace, resource, indicator) (rate(redis_operator_controller_redis_unhealthy{resource=\"$tenant\"}[2m]) * 30)", - "legendFormat": "{{namespace}} {{resource}} {{indicator}}", - "range": true, - "refId": "A" - } - ], - "title": "Redis Unhealthy", - "type": "timeseries" + "overrides": [] }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 9 + }, + "id": 33, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30)", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total K8s API calls made in last minute (aggregated)", + "type": "stat" + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 14, + "panels": [], + "title": "Operator - Functioning Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" } }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 15, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Last *", - "sortDesc": false - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (namespace, kind, object, operation) (rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\",status=\"FAIL\"}[1m]) * 30)", - "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", - "range": true, - "refId": "A" - } - ], - "title": "k8s ops - failures", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": 
{ - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "mappings": [ + { + "options": { + "from": 0.001, + "result": { + "color": "red", + "index": 0 + }, + "to": 999999 }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" + "type": "range" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "thresholdsStyle": { - "mode": "off" + { + "color": "red", + "value": 0.1 } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 5, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Last *", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" + ] } }, - "targets": [ + "overrides": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\",status=\"FAIL\"}[1m]) * 30)", - "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", - "range": true, - "refId": "A" - } - ], - "title": "redis ops - failures", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - 
"defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "matcher": { + "id": "byRegexp", + "options": "(.*) UNHEALTHY" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" } - ] - } + } + ] }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 32 - }, - "id": 4, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Last *", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30", - "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", - "range": true, - "refId": "A" - } - ], - "title": "k8s ops - all", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", 
- "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } + "matcher": { + "id": "byRegexp", + "options": "(.*) HEALTHY" }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 32 - }, - "id": 16, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Last *", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" + } + ] } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 38, + "options": { + "legend": { + "calcs": [ + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", - "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", - "range": true, - "refId": "A" - } - ], - "title": "redis ops - all", - "type": "timeseries" + "tooltip": { + "mode": "single", + "sort": "none" + } }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 40 - }, - "id": 31, - "panels": [ - { - 
"datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 11, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": " (rate(redis_operator_controller_sentinel_check{resource=\"$tenant\", status=\"UNHEALTHY\"}[60s]) * 60) > bool 0", + "instant": false, + "legendFormat": "{{indicator}} {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Sentinel Checks - Unhealthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "from": 0.001, + "result": { + "color": "red", + "index": 0 }, - "mappings": [], - 
"thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - } + "to": 999999 }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 35 - }, - "id": 20, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true + "type": "range" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - "mode": "single", - "sort": "none" + { + "color": "red", + "value": 0.1 } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(.*) UNHEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "(rate(go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } } - ], - "title": "rate of objects allocated [heap]", - "type": "timeseries" + ] }, { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": 
[ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 35 - }, - "id": 21, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "matcher": { + "id": "byRegexp", + "options": "(.*) HEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} - go_memstats_frees_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } } - ], - "title": "number of live objects [heap]", - "type": "timeseries" + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - 
"group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] + "editorMode": "code", + "exemplar": false, + "expr": " (rate(redis_operator_controller_redis_check{resource=\"$tenant\", status=\"UNHEALTHY\"}[60s]) * 60) > bool 0", + "instant": false, + "legendFormat": "{{indicator}} {{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Redis Checks - Unhealthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "from": 0.001, + "result": { + "color": "red", + "index": 0 }, - "unit": "decbytes" + "to": 999999 }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 43 - }, - "id": 26, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true + "type": "range" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - "mode": "single", - "sort": "none" + { + "color": "red", + "value": 0.1 } + ] + } + }, + "overrides": [ + { + 
"matcher": { + "id": "byRegexp", + "options": "(.*) UNHEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_memstats_sys_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } } - ], - "title": "Total Used Memory", - "type": "timeseries" + ] }, { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 43 - }, - "id": 25, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "matcher": { + "id": "byRegexp", + "options": "(.*) HEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_memstats_heap_alloc_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", - "legendFormat": "{{pod}}", - "range": 
true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } } - ], - "title": "heap memory in use", - "type": "timeseries" + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 46, + "options": { + "legend": { + "calcs": [ + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] + "editorMode": "code", + "expr": "min by (indicator, instance) (rate(redis_operator_controller_sentinel_check{resource=\"$tenant\", status=\"HEALTHY\"}[60s]) * 60) != bool 0", + "legendFormat": "{{indicator}}", + "range": true, + "refId": "A" + } + ], + "title": "Sentinel Checks - Healthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 50, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "from": 0.001, + "result": { + "color": "red", + "index": 0 }, - "unit": "decbytes" + "to": 999999 }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 51 - }, - "id": 23, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true + "type": "range" + } + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - "mode": "single", - "sort": "none" + { + "color": "red", + "value": 0.1 } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(.*) UNHEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "(rate(go_memstats_alloc_bytes_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } } - ], - "title": "stack memory allocation rate", - "type": "timeseries" + ] }, { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - 
"hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 51 - }, - "id": 24, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Mean", - "sortDesc": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "matcher": { + "id": "byRegexp", + "options": "(.*) HEALTHY" }, - "targets": [ + "properties": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_memstats_stack_inuse_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } } - ], - "title": "stack memory", - "type": "timeseries" + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 45, + "options": { + "legend": { + "calcs": [ + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - 
"axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 59 - }, - "id": 22, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "editorMode": "code", + "expr": "min by (indicator, instance) (rate(redis_operator_controller_redis_check{resource=\"$tenant\", status=\"HEALTHY\"}[60s]) * 60) != bool 0", + "legendFormat": "{{indicator}}", + "range": true, + "refId": "A" + } + ], + "title": "Redis Checks - Healthy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - 
"mode": "single", - "sort": "none" + { + "color": "red", + "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (namespace, kind, object, operation) (rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\",status=\"FAIL\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "k8s ops - failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "targets": [ + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_goroutines{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 } - ], - "title": "number of goroutines", - "type": "timeseries" + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 59 - }, - "id": 27, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Max", - "sortDesc": true + "editorMode": "code", + "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\",status=\"FAIL\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", + "range": true, + "refId": "A" + } + ], + 
"title": "redis ops - failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - "mode": "single", - "sort": "none" + { + "color": "red", + "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(redis_operator_controller_k8s_operations{namespace=\"$kubernetesnamespace\", object=~\"$kubernetesResource\"}[1m]) * 30", + "legendFormat": "{{namespace}} {{kind}} {{object}} {{operation}} {{status}}", + "range": true, + "refId": "A" + } + ], + "title": "k8s ops - all", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", 
+ "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - "targets": [ + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "go_gc_duration_seconds{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\", quantile=\"1\"}", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "title": "GC Duration [max]", - "type": "timeseries" + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - 
"showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "redisoperator-7c8f6975b4-px5tc" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 28, - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "mean" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "sortBy": "Max", - "sortDesc": true + "editorMode": "code", + "expr": "sum by (namespace, kind, operation, status, IP) (rate(redis_operator_controller_redis_operations{IP=~\"$redis_pod_ip|$sentinel_pod_ip\"}[1m]) * 30)", + "legendFormat": "{{namespace}} {{kind}} {{IP}} {{operation}} {{status}} ", + "range": true, + "refId": "A" + } + ], + "title": "redis ops - all", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, + "id": 31, + "panels": [], + "title": "Go Metrics system metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 11, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, - "tooltip": { - "mode": "single", - "sort": "none" + { + "color": "red", + "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(rate(go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "rate of objects allocated [heap]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "targets": [ + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum by (pod, container) ((container_cpu_cfs_throttled_periods_total{container=\"app\", 
pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} / container_cpu_cfs_periods_total{container=\"app\", pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} ) * 100)", - "legendFormat": "{{pod}}", - "range": true, - "refId": "A" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "title": "CPU throttled [%]", - "type": "timeseries" + ] } - ], - "title": "Operator system metrics", - "type": "row" - } - ], - "refresh": "30s", - "schemaVersion": 37, - "style": "dark", - "tags": [], - "templating": { - "list": [ + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "current": { - "selected": false, - "text": "redis-failovertest", - "value": "redis-failovertest" - }, "datasource": { "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(redis_operator_controller_cluster_ok, name)", - "hide": 0, - "includeAll": false, - "multi": false, - "name": "tenant", - "options": [], - "query": { - "query": "label_values(redis_operator_controller_cluster_ok, name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_mallocs_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} - go_memstats_frees_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "number of live objects [heap]", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_name)", - "hide": 2, - "includeAll": true, - "multi": false, - "name": "kubernetesResource", - "options": [], - "query": { - "query": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" + "type": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_sys_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "Total Used Memory", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "current": { - "selected": false, - "text": "failovertest", - "value": "failovertest" + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_heap_alloc_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "heap memory in use", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { "datasource": { "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_namespace)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "kubernetesnamespace", - "options": [], - "query": { - "query": "label_values(redis_operator_controller_ensure_resource_success{resource_name=\"$tenant\"}, object_namespace)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(rate(go_memstats_alloc_bytes_total{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}[1m]))", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "stack memory allocation rate", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", 
+ "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "current": { - "selected": false, - "text": "All", - "value": "$__all" + "datasource": { + "type": "prometheus" }, + "editorMode": "code", + "expr": "go_memstats_stack_inuse_bytes{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "stack memory", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": 
[ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", - "hide": 2, - "includeAll": true, - "multi": false, - "name": "redis_pod_ip", - "options": [], - "query": { - "query": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" + "type": "prometheus" + }, + "editorMode": "code", + "expr": "go_goroutines{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "number of goroutines", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": 
"red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 73 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { - "current": { - "selected": false, - "text": "All", - "value": "$__all" + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "go_gc_duration_seconds{pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\", quantile=\"1\"}", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Duration [max]", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "redisoperator-7c8f6975b4-px5tc" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 81 + }, + "id": 28, + 
"options": { + "legend": { + "calcs": [ + "lastNotNull", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { "datasource": { "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", - "hide": 2, - "includeAll": true, - "multi": false, - "name": "sentinel_pod_ip", - "options": [], - "query": { - "query": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (pod, container) ((container_cpu_cfs_throttled_periods_total{container=\"app\", pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} / container_cpu_cfs_periods_total{container=\"app\", pod=~\"redisoperator-[0-9a-z]+-[0-9a-z]+\"} ) * 100)", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" } - ] - }, - "time": { - "from": "2022-09-30T06:56:37.303Z", - "to": "2022-09-30T07:56:37.303Z" - }, - "timepicker": {}, - "timezone": "", - "title": "[WIP] redis operator", - "uid": "redis-failover", - "version": 6, - "weekStart": "" - } \ No newline at end of file + ], + "title": "CPU throttled [%]", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "redisfailover", + "value": "redisfailover" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(redis_operator_controller_cluster_ok, name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "tenant", + "options": [], + "query": { + 
"query": "label_values(redis_operator_controller_cluster_ok, name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(redis_operator_controller_ensure_resource{resource_name=\"$tenant\"}, object_name)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "kubernetesResource", + "options": [], + "query": { + "query": "label_values(redis_operator_controller_ensure_resource{resource_name=\"$tenant\"}, object_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(redis_operator_controller_ensure_resource{resource_name=\"$tenant\"}, object_namespace)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "kubernetesnamespace", + "options": [], + "query": { + "query": "label_values(redis_operator_controller_ensure_resource{resource_name=\"$tenant\"}, object_namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "redis_pod_ip", + "options": [], + "query": { + "query": "label_values(kube_pod_info{created_by_kind=\"StatefulSet\", created_by_name=~\"rfr-$tenant\"}, pod_ip)", + "refId": "StandardVariableQuery" + }, 
+ "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "sentinel_pod_ip", + "options": [], + "query": { + "query": "label_values(kube_pod_info{ created_by_kind=\"ReplicaSet\", created_by_name=~\"rfs-$tenant-[0-9a-z]{10}\"}, pod_ip)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "redis operator details", + "uid": "redis-failover", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/example/grafana-dashboard/summary-dashboard.json b/example/grafana-dashboard/summary-dashboard.json deleted file mode 100644 index d4714455d..000000000 --- a/example/grafana-dashboard/summary-dashboard.json +++ /dev/null @@ -1,200 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "data link", - "url": 
"d/redis-failover/wip-redis-operator?orgId=98&refresh=30s&${tenant:queryparam}&var-kubernetesResource=All&var-redis_pod_ip=All&var-sentinel_pod_ip=All&from=${__from}&to=${__to}" - } - ], - "mappings": [ - { - "options": { - "0": { - "color": "green", - "index": 0, - "text": "Healthy" - } - }, - "type": "value" - }, - { - "options": { - "match": "null", - "result": { - "color": "green", - "index": 1, - "text": "Healthy" - } - }, - "type": "special" - }, - { - "options": { - "from": 0.01, - "result": { - "color": "orange", - "index": 2, - "text": "Unhealthy" - }, - "to": 99999 - }, - "type": "range" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 2, - "links": [ - { - "targetBlank": true, - "title": "details", - "url": "d/redis-failover/wip-redis-operator?orgId=98&refresh=30s&${tenant:queryparam}&var-kubernetesResource=All&var-redis_pod_ip=All&var-sentinel_pod_ip=All&from=${__from}&to=${__to}" - } - ], - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.1.5", - "repeat": "tenant", - "repeatDirection": "h", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "expr": "max(rate(redis_operator_controller_sentinel_unhealthy{resource=~\"$tenant\"}[2m]) * 30) ", - "legendFormat": "Sentinels Status", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "editorMode": "code", - "expr": "max(rate(redis_operator_controller_redis_unhealthy{resource=~\"$tenant\"}[2m]) * 30)", - "hide": false, - "legendFormat": "Redis Status", - "range": true, - "refId": "B" - } - ], - 
"title": "tenant health - $tenant", - "transparent": true, - "type": "stat" - } - ], - "schemaVersion": 37, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, - "datasource": { - "type": "prometheus", - "uid": "FoDDSbR7k" - }, - "definition": "label_values(redis_operator_controller_cluster_ok, name)", - "hide": 0, - "includeAll": true, - "multi": false, - "name": "tenant", - "options": [], - "query": { - "query": "label_values(redis_operator_controller_cluster_ok, name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "[WIP] redis operator - Summary", - "weekStart": "" -} \ No newline at end of file diff --git a/example/operator/all-redis-operator-resources.yaml b/example/operator/all-redis-operator-resources.yaml index 1e184380e..dfc382a71 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -96,8 +96,70 @@ rules: - poddisruptionbudgets verbs: - "*" + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - "*" + --- apiVersion: v1 kind: ServiceAccount metadata: - name: redisoperator \ No newline at end of file + name: redisoperator +--- + +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: http + prometheus.io/scrape: "true" + name: redisoperator + labels: + app: redisoperator +spec: + type: ClusterIP + ports: + - name: metrics + port: 9710 + protocol: TCP + targetPort: metrics + selector: + app: redisoperator +--- + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: redis-operator-metrics + labels: + app: redisoperator + release: prometheus +spec: + selector: + matchLabels: + app: redisoperator + endpoints: + - 
port: metrics + namespaceSelector: + matchNames: + - default +--- + + +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: redisoperator + labels: + app: redisoperator + release: prometheus +spec: + selector: + matchLabels: + app: redisoperator + podMetricsEndpoints: + - port: metrics From 4d3f851c06f8c7103e9e8f4174eda147eef4b8d6 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Fri, 14 Oct 2022 19:21:25 +0530 Subject: [PATCH 17/24] revert unwanted changes --- api/redisfailover/v1/validate.go | 1 - operator/redisfailover/handler.go | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/api/redisfailover/v1/validate.go b/api/redisfailover/v1/validate.go index 5b4bcb4f8..d36d3bc0c 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -28,7 +28,6 @@ func (r *RedisFailover) Validate() error { } else { r.Spec.Redis.CustomConfig = deduplicateStr(append(defaultRedisCustomConfig, r.Spec.Redis.CustomConfig...)) } - if r.Spec.Redis.Image == "" { r.Spec.Redis.Image = defaultImage } diff --git a/operator/redisfailover/handler.go b/operator/redisfailover/handler.go index 94beed0f5..05697fd01 100644 --- a/operator/redisfailover/handler.go +++ b/operator/redisfailover/handler.go @@ -3,10 +3,7 @@ package redisfailover import ( "context" "fmt" - "os" "regexp" - "runtime/debug" - "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -86,13 +83,6 @@ func (r *RedisFailoverHandler) Handle(_ context.Context, obj runtime.Object) err r.mClient.SetClusterOK(rf.Namespace, rf.Name) - // --------------------- - file, err := os.Create("/tmp/" + fmt.Sprintf("%v", time.Now().Second()) + ".dump") - if err != nil { - log.Error(err) - } - debug.WriteHeapDump(file.Fd()) - return nil } From fcfe81518a71f4a8314bb487824bc64e62305f12 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Fri, 14 Oct 2022 19:22:42 +0530 Subject: [PATCH 18/24] test without sanitising config --- service/redis/client.go | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/service/redis/client.go b/service/redis/client.go index 8fd453580..0558993fc 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -342,9 +342,9 @@ func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, return err } // If the configuration is an empty line , it will result in an incorrect configSet, which will not run properly down the line. - if strings.TrimSpace(param) == "" || strings.TrimSpace(value) == "" { - continue - } + // if strings.TrimSpace(param) == "" || strings.TrimSpace(value) == "" { + // continue + // } if err := c.applyRedisConfig(param, value, rClient); err != nil { return err } From 399ebf07d2e865cc0dec6794d51eae139a97eeb8 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Fri, 14 Oct 2022 21:41:01 +0530 Subject: [PATCH 19/24] revert test changes --- service/redis/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/service/redis/client.go b/service/redis/client.go index 0558993fc..8fd453580 100644 --- a/service/redis/client.go +++ b/service/redis/client.go @@ -342,9 +342,9 @@ func (c *client) SetCustomRedisConfig(ip string, port string, configs []string, return err } // If the configuration is an empty line , it will result in an incorrect configSet, which will not run properly down the line. - // if strings.TrimSpace(param) == "" || strings.TrimSpace(value) == "" { - // continue - // } + if strings.TrimSpace(param) == "" || strings.TrimSpace(value) == "" { + continue + } if err := c.applyRedisConfig(param, value, rClient); err != nil { return err } From 97f8a4b4ce9b5983275307a398986acfdae75090 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Sat, 15 Oct 2022 16:36:53 +0530 Subject: [PATCH 20/24] Change setEnsureOperationMetrics function to stick to processing metrics; donot return errors. 
--- api/redisfailover/v1/validate.go | 1 + operator/redisfailover/checker.go | 4 +--- operator/redisfailover/handler.go | 1 - operator/redisfailover/service/client.go | 30 +++++++++++++++--------- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/api/redisfailover/v1/validate.go b/api/redisfailover/v1/validate.go index d36d3bc0c..5b4bcb4f8 100644 --- a/api/redisfailover/v1/validate.go +++ b/api/redisfailover/v1/validate.go @@ -28,6 +28,7 @@ func (r *RedisFailover) Validate() error { } else { r.Spec.Redis.CustomConfig = deduplicateStr(append(defaultRedisCustomConfig, r.Spec.Redis.CustomConfig...)) } + if r.Spec.Redis.Image == "" { r.Spec.Redis.Image = defaultImage } diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index a7f619b82..529897d9a 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -108,7 +108,6 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e r.logger.Debug("Number of redis mismatch, this could be for a change on the statefulset") return nil } - r.mClient.RecordRedisCheck(rf.Namespace, rf.Name, metrics.REDIS_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, metrics.STATUS_HEALTHY) err = r.rfChecker.CheckSentinelNumber(rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) @@ -189,14 +188,13 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e port := getRedisPort(rf.Spec.Redis.Port) for _, sip := range sentinels { err = r.rfChecker.CheckSentinelMonitor(sip, master, port) - setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, sip, metrics.NOT_APPLICABLE, err) + setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_WRONG_MASTER, sip, err) if err != nil { r.logger.Debug("Sentinel is not monitoring the correct master") if err := r.rfHealer.NewSentinelMonitor(sip, master, rf); err != nil { 
return err } } - } return r.checkAndHealSentinels(rf, sentinels) } diff --git a/operator/redisfailover/handler.go b/operator/redisfailover/handler.go index 05697fd01..6a03afd56 100644 --- a/operator/redisfailover/handler.go +++ b/operator/redisfailover/handler.go @@ -82,7 +82,6 @@ func (r *RedisFailoverHandler) Handle(_ context.Context, obj runtime.Object) err } r.mClient.SetClusterOK(rf.Namespace, rf.Name) - return nil } diff --git a/operator/redisfailover/service/client.go b/operator/redisfailover/service/client.go index 2377ccdd8..22c8d60ea 100644 --- a/operator/redisfailover/service/client.go +++ b/operator/redisfailover/service/client.go @@ -69,14 +69,16 @@ func generateRedisSlaveRoleLabel() map[string]string { func (r *RedisFailoverKubeClient) EnsureSentinelService(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { svc := generateSentinelService(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) - return r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + return err } // EnsureSentinelConfigMap makes sure the sentinel configmap exists func (r *RedisFailoverKubeClient) EnsureSentinelConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := generateSentinelConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return err } // EnsureSentinelDeployment makes sure the sentinel deployment exists in the desired state @@ -87,7 +89,8 @@ func (r *RedisFailoverKubeClient) EnsureSentinelDeployment(rf *redisfailoverv1.R d := generateSentinelDeployment(rf, labels, ownerRefs) err := 
r.K8SService.CreateOrUpdateDeployment(rf.Namespace, d) - return r.setEnsureOperationMetrics(d.Namespace, d.Name, "Deployment", rf.Name, err) + r.setEnsureOperationMetrics(d.Namespace, d.Name, "Deployment", rf.Name, err) + return err } // EnsureRedisStatefulset makes sure the redis statefulset exists in the desired state @@ -98,7 +101,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisStatefulset(rf *redisfailoverv1.Red ss := generateRedisStatefulSet(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateStatefulSet(rf.Namespace, ss) - return r.setEnsureOperationMetrics(ss.Namespace, ss.Name, "StatefulSet", rf.Name, err) + r.setEnsureOperationMetrics(ss.Namespace, ss.Name, "StatefulSet", rf.Name, err) + return err } // EnsureRedisConfigMap makes sure the Redis ConfigMap exists @@ -112,7 +116,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisConfigMap(rf *redisfailoverv1.Redis cm := generateRedisConfigMap(rf, labels, ownerRefs, password) err = r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return err } // EnsureRedisShutdownConfigMap makes sure the redis configmap with shutdown script exists @@ -124,7 +129,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover } else { cm := generateRedisShutdownConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return err } return nil } @@ -133,7 +139,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisShutdownConfigMap(rf *redisfailover func (r *RedisFailoverKubeClient) EnsureRedisReadinessConfigMap(rf *redisfailoverv1.RedisFailover, labels map[string]string, ownerRefs []metav1.OwnerReference) error { cm := 
generateRedisReadinessConfigMap(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateConfigMap(rf.Namespace, cm) - return r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + r.setEnsureOperationMetrics(cm.Namespace, cm.Name, "ConfigMap", rf.Name, err) + return err } // EnsureRedisService makes sure the redis statefulset exists @@ -141,7 +148,8 @@ func (r *RedisFailoverKubeClient) EnsureRedisService(rf *redisfailoverv1.RedisFa svc := generateRedisService(rf, labels, ownerRefs) err := r.K8SService.CreateOrUpdateService(rf.Namespace, svc) - return r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + r.setEnsureOperationMetrics(svc.Namespace, svc.Name, "Service", rf.Name, err) + return err } // EnsureNotPresentRedisService makes sure the redis service is not present @@ -169,13 +177,13 @@ func (r *RedisFailoverKubeClient) ensurePodDisruptionBudget(rf *redisfailoverv1. pdb := generatePodDisruptionBudget(name, namespace, labels, ownerRefs, minAvailable) err := r.K8SService.CreateOrUpdatePodDisruptionBudget(namespace, pdb) - return r.setEnsureOperationMetrics(pdb.Namespace, pdb.Name, "PodDisruptionBudget" /* pdb.TypeMeta.Kind isnt working; pdb.Kind isnt working either */, rf.Name, err) + r.setEnsureOperationMetrics(pdb.Namespace, pdb.Name, "PodDisruptionBudget" /* pdb.TypeMeta.Kind isnt working; pdb.Kind isnt working either */, rf.Name, err) + return err } -func (r *RedisFailoverKubeClient) setEnsureOperationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) error { +func (r *RedisFailoverKubeClient) setEnsureOperationMetrics(objectNamespace string, objectName string, objectKind string, ownerName string, err error) { if nil != err { r.metricsClient.RecordEnsureOperation(objectNamespace, objectName, objectKind, ownerName, metrics.FAIL) } r.metricsClient.RecordEnsureOperation(objectNamespace, objectName, objectKind, ownerName, metrics.SUCCESS) - return err } From 
cf2f9ce5f2c57053f14e6b8356d0ec65a7bcf197 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Sat, 15 Oct 2022 16:54:04 +0530 Subject: [PATCH 21/24] Fix wrongful attribution of metrics --- service/k8s/deployment.go | 2 +- service/k8s/rbac.go | 2 +- service/k8s/service.go | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/service/k8s/deployment.go b/service/k8s/deployment.go index ec945014b..b73cc984f 100644 --- a/service/k8s/deployment.go +++ b/service/k8s/deployment.go @@ -120,6 +120,6 @@ func (d *DeploymentService) DeleteDeployment(namespace, name string) error { // ListDeployments will give all the deployments on a given namespace func (d *DeploymentService) ListDeployments(namespace string) (*appsv1.DeploymentList, error) { deployments, err := d.kubeClient.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{}) - recordMetrics(namespace, "Deployment", metrics.NOT_APPLICABLE, "DELETE", err, d.metricsRecorder) + recordMetrics(namespace, "Deployment", metrics.NOT_APPLICABLE, "LIST", err, d.metricsRecorder) return deployments, err } diff --git a/service/k8s/rbac.go b/service/k8s/rbac.go index 300f84a4e..2ca90bb7e 100644 --- a/service/k8s/rbac.go +++ b/service/k8s/rbac.go @@ -110,7 +110,7 @@ func (r *RBACService) CreateOrUpdateRole(namespace string, role *rbacv1.Role) er func (r *RBACService) DeleteRoleBinding(namespace, name string) error { err := r.kubeClient.RbacV1().RoleBindings(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}) - recordMetrics(namespace, "Role", name, "DELETE", err, r.metricsRecorder) + recordMetrics(namespace, "RoleBinding", name, "DELETE", err, r.metricsRecorder) if err != nil { return err } diff --git a/service/k8s/service.go b/service/k8s/service.go index 867b4cebe..1e086aeae 100644 --- a/service/k8s/service.go +++ b/service/k8s/service.go @@ -61,7 +61,6 @@ func (s *ServiceService) CreateService(namespace string, service *corev1.Service } func (s *ServiceService) 
CreateIfNotExistsService(namespace string, service *corev1.Service) error { - log.Debugf("trying to get %v service in %v namespace... ", service.GetName(), namespace) if _, err := s.GetService(namespace, service.Name); err != nil { // If no resource we need to create. if errors.IsNotFound(err) { From dd8606f0037d219ff45e75f52b6b52439b837a99 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Mon, 17 Oct 2022 17:44:03 +0530 Subject: [PATCH 22/24] revert erroneous deletion --- operator/redisfailover/checker.go | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/redisfailover/checker.go b/operator/redisfailover/checker.go index 529897d9a..b51755a8f 100644 --- a/operator/redisfailover/checker.go +++ b/operator/redisfailover/checker.go @@ -226,6 +226,7 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red err = r.rfChecker.CheckSentinelNumber(rf) setRedisCheckerMetrics(r.mClient, "sentinel", rf.Namespace, rf.Name, metrics.SENTINEL_REPLICA_MISMATCH, metrics.NOT_APPLICABLE, err) if err != nil { + r.logger.Debug("Number of sentinel mismatch, this could be for a change on the deployment") return nil } From 62220a04face3401d4b6ff762ad5751c72bdfb70 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 20 Oct 2022 13:20:21 +0530 Subject: [PATCH 23/24] rename counters with _total, as per prometheus' best practices --- metrics/metrics.go | 66 ++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 43 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 07bcb4ebf..b09e0e99e 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -57,6 +57,11 @@ const ( SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" ) +var ( // Used for GCing stale metrics + trackedIPs map[string]bool // IPs of either redis or sentinels that are being labelled every check and heal loop + trackedResources map[string]map[string]map[string]bool // namespace -> kind -> object +) + // Instrumenter is the interface that will collect the 
metrics and has ability to send/expose those metrics. type Recorder interface { koopercontroller.MetricsRecorder @@ -67,7 +72,6 @@ type Recorder interface { DeleteCluster(namespace string, name string) // Indicate redis instances being monitored - SetRedisInstance(IP string, masterIP string, role string) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) RecordRedisCheck(namespace string, resource string, indicator /* aspect of redis that is unhealthy */ string, instance string, status string) @@ -80,14 +84,12 @@ type Recorder interface { // PromMetrics implements the instrumenter so the metrics can be managed by Prometheus. type recorder struct { // Metrics fields. - clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster - ensureResource *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. - ensureResourceFailure *prometheus.CounterVec // number of failed "ensure" operators performed by the controller. - redisInstance *prometheus.GaugeVec // indicates known redis instances, with IPs and master/slave status - redisCheck *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) - sentinelCheck *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) - k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s - redisOperations *prometheus.CounterVec // number of operations performed on redis/sentinel instances + clusterOK *prometheus.GaugeVec // clusterOk is the status of a cluster + ensureResource *prometheus.CounterVec // number of successful "ensure" operators performed by the controller. 
+ redisCheck *prometheus.CounterVec // indicates any error encountered in managed redis instance(s) + sentinelCheck *prometheus.CounterVec // indicates any error encountered in managed sentinel instance(s) + k8sServiceOperations *prometheus.CounterVec // number of operations performed on k8s + redisOperations *prometheus.CounterVec // number of operations performed on redis/sentinel instances koopercontroller.MetricsRecorder } @@ -104,35 +106,21 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { ensureResource := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "ensure_resource", - Help: "number of successful 'ensure' operations on a resource performed by the controller.", + Name: "ensure_resource_total", + Help: "number of 'ensure' operations on a resource performed by the controller.", }, []string{"object_namespace", "object_name", "object_kind", "resource_name", "status"}) - ensureResourceFailure := prometheus.NewCounterVec(prometheus.CounterOpts{ - Namespace: namespace, - Subsystem: promControllerSubsystem, - Name: "ensure_resource_failure", - Help: "number of failed 'ensure' operations on a resource performed by the controller.", - }, []string{"object_namespace", "object_name", "object_kind", "resource_name"}) - - redisInstance := prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: namespace, - Subsystem: promControllerSubsystem, - Name: "redis_instance_info", - Help: "redis instances discovered. 
IPs of redis instances, and Master/Slave role as indicators in the labels.", - }, []string{"IP", "MasterIP", "role"}) - redisCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "redis_check", + Name: "redis_checks_total", Help: "indicates any error encountered in managed redis instance(s)", }, []string{"namespace", "resource", "indicator", "instance", "status"}) sentinelCheck := prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "sentinel_check", + Name: "sentinel_checks_total", Help: "indicates any error encountered in managed sentinel instance(s)", }, []string{"namespace", "resource", "indicator", "instance", "status"}) @@ -140,7 +128,7 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "redis_operations", + Name: "redis_operations_total", Help: "number of operations performed on redis", }, []string{"kind" /* redis/sentinel? */, "IP", "operation", "status", "err"}) @@ -148,19 +136,17 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { prometheus.CounterOpts{ Namespace: namespace, Subsystem: promControllerSubsystem, - Name: "k8s_operations", + Name: "k8s_operations_total", Help: "number of operations performed on k8s", }, []string{"namespace", "kind", "object", "operation", "status", "err"}) // Create the instance. 
r := recorder{ - clusterOK: clusterOK, - ensureResource: ensureResource, - ensureResourceFailure: ensureResourceFailure, - redisInstance: redisInstance, - redisCheck: redisCheck, - sentinelCheck: sentinelCheck, - k8sServiceOperations: k8sServiceOperations, - redisOperations: redisOperations, + clusterOK: clusterOK, + ensureResource: ensureResource, + redisCheck: redisCheck, + sentinelCheck: sentinelCheck, + k8sServiceOperations: k8sServiceOperations, + redisOperations: redisOperations, MetricsRecorder: kooperprometheus.New(kooperprometheus.Config{ Registerer: reg, }), @@ -170,8 +156,6 @@ func NewRecorder(namespace string, reg prometheus.Registerer) Recorder { reg.MustRegister( r.clusterOK, r.ensureResource, - r.ensureResourceFailure, - r.redisInstance, r.redisCheck, r.sentinelCheck, r.k8sServiceOperations, @@ -196,10 +180,6 @@ func (r recorder) DeleteCluster(namespace string, name string) { r.clusterOK.DeleteLabelValues(namespace, name) } -func (r recorder) SetRedisInstance(IP string, masterIP string, role string) { - r.redisInstance.WithLabelValues(IP, masterIP, role).Set(1) -} - func (r recorder) RecordEnsureOperation(objectNamespace string, objectName string, objectKind string, resourceName string, status string) { r.ensureResource.WithLabelValues(objectNamespace, objectName, objectKind, resourceName, status).Add(1) } From ddc51cc02835a49a9219b20924956c6bc51aea96 Mon Sep 17 00:00:00 2001 From: corneredrat Date: Thu, 20 Oct 2022 13:21:31 +0530 Subject: [PATCH 24/24] remove unused code --- metrics/metrics.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index b09e0e99e..560e799f4 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -57,11 +57,6 @@ const ( SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY" ) -var ( // Used for GCing stale metrics - trackedIPs map[string]bool // IPs of either redis or sentinels that are being labelled every check and heal loop - trackedResources map[string]map[string]map[string]bool // 
namespace -> kind -> object -) - // Instrumenter is the interface that will collect the metrics and has ability to send/expose those metrics. type Recorder interface { koopercontroller.MetricsRecorder