From 6ae22c7f4779c906022d86ba8330b7fee16d3dfc Mon Sep 17 00:00:00 2001 From: Danny Thomson Date: Mon, 25 Mar 2019 08:38:08 -0700 Subject: [PATCH] Add Initial Prometheus Metrics --- Gopkg.lock | 77 +- controller/bluegreen.go | 33 +- controller/canary.go | 31 +- controller/controller.go | 44 +- controller/metrics/metrics.go | 171 ++++ controller/metrics/metrics_test.go | 121 +++ controller/sync.go | 8 +- examples/dashboard.json | 880 ++++++++++++++++++ .../base/argo-rollouts-metrics-service.yaml | 12 + manifests/base/kustomization.yaml | 2 +- manifests/install.yaml | 13 + manifests/namespace-install.yaml | 13 + utils/defaults/defaults.go | 10 + utils/defaults/defaults_test.go | 27 + 14 files changed, 1396 insertions(+), 46 deletions(-) create mode 100644 controller/metrics/metrics.go create mode 100644 controller/metrics/metrics_test.go create mode 100644 examples/dashboard.json create mode 100644 manifests/base/argo-rollouts-metrics-service.yaml diff --git a/Gopkg.lock b/Gopkg.lock index 26a581d702..8c2b69b980 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -33,6 +33,14 @@ pruneopts = "" revision = "38f6a293f140402953f884b015014e0cd519bbb3" +[[projects]] + branch = "master" + digest = "1:c0bec5f9b98d0bc872ff5e834fac186b807b656683bd29cb82fb207a1513fabb" + name = "github.com/beorn7/perks" + packages = ["quantile"] + pruneopts = "" + revision = "3a771d992973f24aa725d07868b467d1ddfceafb" + [[projects]] digest = "1:0deddd908b6b4b768cfc272c16ee61e7088a60f7fe2f06c547bd3d8e1f8b8e77" name = "github.com/davecgh/go-spew" @@ -246,6 +254,14 @@ pruneopts = "" revision = "60711f1a8329503b04e1c88535f419d0bb440bff" +[[projects]] + digest = "1:63722a4b1e1717be7b98fc686e0b30d5e7f734b9e93d7dee86293b6deab7ea28" + name = "github.com/matttproud/golang_protobuf_extensions" + packages = ["pbutil"] + pruneopts = "" + revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" + version = "v1.0.1" + [[projects]] digest = "1:0c0ff2a89c1bb0d01887e1dac043ad7efbf3ec77482ef058ac423d13497e16fd" name = 
"github.com/modern-go/concurrent" @@ -294,6 +310,14 @@ revision = "5f041e8faa004a95c88a202771f4cc3e991971e6" version = "v2.0.1" +[[projects]] + digest = "1:1d7e1867c49a6dd9856598ef7c3123604ea3daabf5b83f303ff457bcbc410b1d" + name = "github.com/pkg/errors" + packages = ["."] + pruneopts = "" + revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" + version = "v0.8.1" + [[projects]] digest = "1:256484dbbcd271f9ecebc6795b2df8cad4c458dd0f5fd82a8c2fa0c29f233411" name = "github.com/pmezard/go-difflib" @@ -302,6 +326,52 @@ revision = "792786c7400a136282c1664665ae0a8db921c6c2" version = "v1.0.0" +[[projects]] + digest = "1:6f218995d6a74636cfcab45ce03005371e682b4b9bee0e5eb0ccfd83ef85364f" + name = "github.com/prometheus/client_golang" + packages = [ + "prometheus", + "prometheus/internal", + "prometheus/promhttp", + ] + pruneopts = "" + revision = "505eaef017263e299324067d40ca2c48f6a2cf50" + version = "v0.9.2" + +[[projects]] + branch = "master" + digest = "1:cd67319ee7536399990c4b00fae07c3413035a53193c644549a676091507cadc" + name = "github.com/prometheus/client_model" + packages = ["go"] + pruneopts = "" + revision = "fd36f4220a901265f90734c3183c5f0c91daa0b8" + +[[projects]] + digest = "1:96af18a3819d2ff7d6aa07e6e50955b11e477dbc8b890324c67462b84adca56b" + name = "github.com/prometheus/common" + packages = [ + "expfmt", + "internal/bitbucket.org/ww/goautoneg", + "model", + ] + pruneopts = "" + revision = "cfeb6f9992ffa54aaa4f2170ade4067ee478b250" + version = "v0.2.0" + +[[projects]] + branch = "master" + digest = "1:5dff64a37ab1e65130c24f01d5fbda61226b73cc61b6f0c8af24373509a89b73" + name = "github.com/prometheus/procfs" + packages = [ + ".", + "internal/util", + "iostats", + "nfs", + "xfs", + ] + pruneopts = "" + revision = "55ae3d9d557340b5bc24cd8aa5f6fa2c2ab31352" + [[projects]] digest = "1:9a3c631555e0351fdc4e696577bb63afd90c399d782a8462dba9d100d7021db3" name = "github.com/sirupsen/logrus" @@ -909,6 +979,9 @@ "github.com/ghodss/yaml", "github.com/go-openapi/spec", 
"github.com/golang/glog", + "github.com/pkg/errors", + "github.com/prometheus/client_golang/prometheus", + "github.com/prometheus/client_golang/prometheus/promhttp", "github.com/sirupsen/logrus", "github.com/spf13/cobra", "github.com/stretchr/testify/assert", @@ -924,6 +997,7 @@ "k8s.io/apimachinery/pkg/runtime/schema", "k8s.io/apimachinery/pkg/runtime/serializer", "k8s.io/apimachinery/pkg/types", + "k8s.io/apimachinery/pkg/util/diff", "k8s.io/apimachinery/pkg/util/intstr", "k8s.io/apimachinery/pkg/util/rand", "k8s.io/apimachinery/pkg/util/runtime", @@ -936,13 +1010,11 @@ "k8s.io/client-go/discovery/fake", "k8s.io/client-go/informers", "k8s.io/client-go/informers/apps/v1", - "k8s.io/client-go/informers/core/v1", "k8s.io/client-go/kubernetes", "k8s.io/client-go/kubernetes/fake", "k8s.io/client-go/kubernetes/scheme", "k8s.io/client-go/kubernetes/typed/core/v1", "k8s.io/client-go/listers/apps/v1", - "k8s.io/client-go/listers/core/v1", "k8s.io/client-go/plugin/pkg/client/auth/gcp", "k8s.io/client-go/plugin/pkg/client/auth/oidc", "k8s.io/client-go/rest", @@ -958,6 +1030,7 @@ "k8s.io/kubernetes/pkg/controller", "k8s.io/kubernetes/pkg/util/hash", "k8s.io/kubernetes/pkg/util/labels", + "k8s.io/utils/pointer", ] solver-name = "gps-cdcl" solver-version = 1 diff --git a/controller/bluegreen.go b/controller/bluegreen.go index 8f4afadeff..6d85329952 100644 --- a/controller/bluegreen.go +++ b/controller/bluegreen.go @@ -7,6 +7,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/kubernetes/pkg/controller" + "github.com/argoproj/argo-rollouts/controller/metrics" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/annotations" "github.com/argoproj/argo-rollouts/utils/conditions" @@ -20,34 +21,33 @@ func (c *Controller) rolloutBlueGreen(r *v1alpha1.Rollout, rsList []*appsv1.Repl logCtx := logutil.WithRollout(r) newRS, oldRSs, err := c.getAllReplicaSetsAndSyncRevision(r, rsList, true) if err != nil { - return err + return 
c.metricsServer.IncError(r, err) } previewSvc, activeSvc, err := c.getPreviewAndActiveServices(r) if err != nil { - return err + return c.metricsServer.IncError(r, err) } allRSs := append(oldRSs, newRS) - // Scale up, if we can. logCtx.Infof("Reconciling new ReplicaSet '%s'", newRS.Name) scaledUp, err := c.reconcileNewReplicaSet(allRSs, newRS, r) if err != nil { - return err + return c.metricsServer.IncError(r, err) } if scaledUp { logCtx.Infof("Not finished reconciling new ReplicaSet '%s'", newRS.Name) - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false, metrics.Progressing) } if previewSvc != nil { logCtx.Infof("Reconciling preview service '%s'", previewSvc.Name) switchPreviewSvc, err := c.reconcilePreviewService(r, newRS, previewSvc, activeSvc) if err != nil { - return err + return c.metricsServer.IncError(r, err) } if switchPreviewSvc { logCtx.Infof("Not finished reconciling preview service' %s'", previewSvc.Name) - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, true) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, true, metrics.Progressing) } } @@ -55,38 +55,39 @@ func (c *Controller) rolloutBlueGreen(r *v1alpha1.Rollout, rsList []*appsv1.Repl pauseBeforeSwitchActive := c.reconcileBlueGreenPause(activeSvc, r) if pauseBeforeSwitchActive { logCtx.Info("Not finished reconciling pause before switching active service") - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, true) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, true, metrics.Progressing) } logCtx.Infof("Reconciling active service '%s'", activeSvc.Name) switchActiveSvc, err := c.reconcileActiveService(r, newRS, previewSvc, activeSvc) if err != nil { - return err + return c.metricsServer.IncError(r, err) } if switchActiveSvc { logCtx.Infof("Not Finished reconciling active 
service '%s'", activeSvc.Name) - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false, metrics.Progressing) } // Scale down, if we can. logCtx.Info("Reconciling old replica sets") scaledDown, err := c.reconcileOldReplicaSets(allRSs, controller.FilterActiveReplicaSets(oldRSs), newRS, r) if err != nil { - return err + return c.metricsServer.IncError(r, err) } if scaledDown { logCtx.Info("Not finished reconciling old replica sets") - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false, metrics.Progressing) } logCtx.Infof("Confirming rollout is complete") if conditions.RolloutComplete(r, &r.Status) { logCtx.Info("Cleaning up old replicasets") if err := c.cleanupRollouts(oldRSs, r); err != nil { - return err + return c.metricsServer.IncError(r, err) } } - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false) + + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false, metrics.Completed) } func (c *Controller) reconcileBlueGreenPause(activeSvc *corev1.Service, rollout *v1alpha1.Rollout) bool { @@ -131,7 +132,7 @@ func (c *Controller) scaleDownOldReplicaSetsForBlueGreen(allRSs []*appsv1.Replic return totalScaledDown, nil } -func (c *Controller) syncRolloutStatusBlueGreen(allRSs []*appsv1.ReplicaSet, newRS *appsv1.ReplicaSet, previewSvc *corev1.Service, activeSvc *corev1.Service, r *v1alpha1.Rollout, addPause bool) error { +func (c *Controller) syncRolloutStatusBlueGreen(allRSs []*appsv1.ReplicaSet, newRS *appsv1.ReplicaSet, previewSvc *corev1.Service, activeSvc *corev1.Service, r *v1alpha1.Rollout, addPause bool, phase metrics.ReconcilePhase) error { newStatus := c.calculateBaseStatus(allRSs, newRS, r) previewSelector, ok := c.getRolloutSelectorLabel(previewSvc) if !ok { @@ -158,7 +159,7 
@@ func (c *Controller) syncRolloutStatusBlueGreen(allRSs []*appsv1.ReplicaSet, new pauseStartTime, paused := calculatePauseStatus(r, addPause) newStatus.PauseStartTime = pauseStartTime - return c.persistRolloutStatus(r, &newStatus, &paused) + return c.persistRolloutStatus(r, &newStatus, &paused, phase) } // Should run only on scaling events and not during the normal rollout process. diff --git a/controller/canary.go b/controller/canary.go index ca731d258e..9e4177a963 100644 --- a/controller/canary.go +++ b/controller/canary.go @@ -8,6 +8,7 @@ import ( "k8s.io/kubernetes/pkg/controller" "k8s.io/utils/pointer" + "github.com/argoproj/argo-rollouts/controller/metrics" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" @@ -21,7 +22,7 @@ func (c *Controller) rolloutCanary(rollout *v1alpha1.Rollout, rsList []*appsv1.R logCtx.Info("List of Canary steps have changed and need to reset CurrentStepIndex") newRS, previousRSs, err := c.getAllReplicaSetsAndSyncRevision(rollout, rsList, false) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } stableRS, oldRSs := replicasetutil.GetStableRS(rollout, newRS, previousRSs) return c.syncRolloutStatusCanary(oldRSs, newRS, stableRS, rollout) @@ -39,7 +40,7 @@ func (c *Controller) rolloutCanary(rollout *v1alpha1.Rollout, rsList []*appsv1.R newRS, previousRSs, err := c.getAllReplicaSetsAndSyncRevision(rollout, rsList, true) stableRS, oldRSs := replicasetutil.GetStableRS(rollout, newRS, previousRSs) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } allRSs := append(oldRSs, newRS) if stableRS != nil { @@ -49,7 +50,7 @@ func (c *Controller) rolloutCanary(rollout *v1alpha1.Rollout, rsList []*appsv1.R logCtx.Info("Reconciling StableRS") scaledStableRS, err := c.reconcileStableRS(oldRSs, newRS, stableRS, rollout) if err != nil { - return err + return 
c.metricsServer.IncError(rollout, err) } if scaledStableRS { logCtx.Infof("Not finished reconciling stableRS") @@ -59,7 +60,7 @@ func (c *Controller) rolloutCanary(rollout *v1alpha1.Rollout, rsList []*appsv1.R logCtx.Infof("Reconciling new ReplicaSet '%s'", newRS.Name) scaledNewRS, err := c.reconcileNewReplicaSet(allRSs, newRS, rollout) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } if scaledNewRS { logCtx.Infof("Not finished reconciling new ReplicaSet '%s'", newRS.Name) @@ -69,7 +70,7 @@ func (c *Controller) rolloutCanary(rollout *v1alpha1.Rollout, rsList []*appsv1.R logCtx.Info("Reconciling old replica sets") scaledDown, err := c.reconcileOldReplicaSetsCanary(allRSs, controller.FilterActiveReplicaSets(oldRSs), newRS, rollout) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } if scaledDown { logCtx.Info("Not finished reconciling old replica sets") @@ -210,6 +211,7 @@ func (c *Controller) syncRolloutStatusCanary(olderRSs []*appsv1.ReplicaSet, newR allRSs = append(allRSs, stableRS) } newStatus := c.calculateBaseStatus(allRSs, newRS, r) + phase := metrics.Progressing currentStep, currentStepIndex := replicasetutil.GetCurrentCanaryStep(r) newStatus.Canary.StableRS = r.Status.Canary.StableRS @@ -220,29 +222,32 @@ func (c *Controller) syncRolloutStatusCanary(olderRSs []*appsv1.ReplicaSet, newR newStatus.CurrentStepIndex = replicasetutil.ResetCurrentStepIndex(r) if r.Status.Canary.StableRS == controller.ComputeHash(&r.Spec.Template, r.Status.CollisionCount) { if newStatus.CurrentStepIndex != nil { + phase = metrics.Completed logCtx.Info("Skipping all steps because the newRS is the stableRS.") newStatus.CurrentStepIndex = pointer.Int32Ptr(stepCount) c.recorder.Eventf(r, corev1.EventTypeNormal, "SetStepIndex", "Set Step Index to %d", int(stepCount)) } } - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } if 
replicasetutil.CheckPodSpecChange(r) { newStatus.CurrentStepIndex = replicasetutil.ResetCurrentStepIndex(r) if r.Status.Canary.StableRS == controller.ComputeHash(&r.Spec.Template, r.Status.CollisionCount) { if newStatus.CurrentStepIndex != nil { + phase = metrics.Completed logCtx.Info("Skipping all steps because the newRS is the stableRS.") newStatus.CurrentStepIndex = pointer.Int32Ptr(stepCount) c.recorder.Eventf(r, corev1.EventTypeNormal, "SetStepIndex", "Set Step Index to %d", int(stepCount)) } } - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } if r.Status.Canary.StableRS == "" { logCtx.Info("Setting StableRS to CurrentPodHash because it is empty beforehand") + phase = metrics.Completed newStatus.Canary.StableRS = newStatus.CurrentPodHash if stepCount > 0 { if stepCount != *currentStepIndex { @@ -251,20 +256,22 @@ func (c *Controller) syncRolloutStatusCanary(olderRSs []*appsv1.ReplicaSet, newR newStatus.CurrentStepIndex = &stepCount } - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } if stepCount == 0 { logCtx.Info("Rollout has no steps so setting stableRS status to currentPodHash") newStatus.Canary.StableRS = newStatus.CurrentPodHash - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + phase = metrics.Completed + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } if *currentStepIndex == stepCount { logCtx.Info("Rollout has executed every step") + phase = metrics.Completed newStatus.CurrentStepIndex = &stepCount newStatus.Canary.StableRS = newStatus.CurrentPodHash - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } if completedCurrentCanaryStep(olderRSs, newRS, stableRS, r) { @@ -275,7 +282,7 @@ func (c *Controller) 
syncRolloutStatusCanary(olderRSs []*appsv1.ReplicaSet, newR } logCtx.Infof("Incrementing the Current Step Index to %d", *currentStepIndex) c.recorder.Eventf(r, corev1.EventTypeNormal, "SetStepIndex", "Set Step Index to %d", int(*currentStepIndex)) - return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false)) + return c.persistRolloutStatus(r, &newStatus, pointer.BoolPtr(false), phase) } addPause := currentStep.Pause != nil @@ -283,5 +290,5 @@ func (c *Controller) syncRolloutStatusCanary(olderRSs []*appsv1.ReplicaSet, newR newStatus.PauseStartTime = pauseStartTime newStatus.CurrentStepIndex = currentStepIndex - return c.persistRolloutStatus(r, &newStatus, &paused) + return c.persistRolloutStatus(r, &newStatus, &paused, phase) } diff --git a/controller/controller.go b/controller/controller.go index 03eaa01449..22ce83120e 100644 --- a/controller/controller.go +++ b/controller/controller.go @@ -6,10 +6,11 @@ import ( "time" "github.com/golang/glog" + "github.com/pkg/errors" log "github.com/sirupsen/logrus" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -24,6 +25,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/kubernetes/pkg/controller" + "github.com/argoproj/argo-rollouts/controller/metrics" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" clientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned" rolloutscheme "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/scheme" @@ -67,6 +69,7 @@ type Controller struct { replicaSetSynced cache.InformerSynced rolloutsLister listers.RolloutLister rolloutsSynced cache.InformerSynced + metricsServer *metrics.MetricsServer // used for unit testing enqueueRollout func(obj interface{}) @@ -105,6 +108,7 @@ func NewController( KubeClient: 
kubeclientset, Recorder: recorder, } + metricsAddr := fmt.Sprintf("0.0.0.0:%d", 8080) controller := &Controller{ kubeclientset: kubeclientset, @@ -117,6 +121,7 @@ func NewController( workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "Rollouts"), recorder: recorder, resyncPeriod: resyncPeriod, + metricsServer: metrics.NewMetricsServer(metricsAddr, rolloutsInformer.Lister()), } controller.enqueueRollout = controller.enqueueRateLimited controller.enqueueRolloutAfter = controller.enqueueAfter @@ -171,6 +176,14 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { } log.Info("Started workers") + go func() { + log.Infof("Starting Metric Server at %s", c.metricsServer.Addr) + err := c.metricsServer.ListenAndServe() + if err != nil { + err = errors.Wrap(err, "Starting Metric Server") + log.Fatal(err) + } + }() <-stopCh log.Info("Shutting down workers") @@ -246,25 +259,27 @@ func (c *Controller) processNextWorkItem() bool { func (c *Controller) syncHandler(key string) error { startTime := time.Now() log.WithField(logutil.RolloutKey, key).Infof("Started syncing rollout at (%v)", startTime) - defer func() { - log.WithField(logutil.RolloutKey, key).Infof("Finished syncing rollout (%v)", time.Since(startTime)) - }() - namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { return err } rollout, err := c.rolloutsLister.Rollouts(namespace).Get(name) - if errors.IsNotFound(err) { + if k8serrors.IsNotFound(err) { log.WithField(logutil.RolloutKey, key).Infof("Rollout %v has been deleted", key) return nil } if err != nil { return err } // Deep-copy otherwise we are mutating our cache.
r := rollout.DeepCopy() + defer func() { + duration := time.Since(startTime) + c.metricsServer.IncReconcile(r, duration) + logCtx := logutil.WithRollout(r).WithField("time_ms", duration.Seconds()*1e3) + logCtx.Info("Reconciliation completed") + }() prevCond := conditions.GetRolloutCondition(rollout.Status, v1alpha1.InvalidSpec) invalidSpecCond := conditions.VerifyRolloutSpec(r, prevCond) @@ -275,9 +290,9 @@ func (c *Controller) syncHandler(key string) error { newStatus := r.Status newStatus.ObservedGeneration = generation conditions.SetRolloutCondition(&newStatus, *invalidSpecCond) - err := c.persistRolloutStatus(r, &newStatus, nil) + err := c.persistRolloutStatus(r, &newStatus, nil, metrics.InvalidSpec) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } } return nil @@ -287,12 +302,12 @@ func (c *Controller) syncHandler(key string) error { // through adoption/orphaning. rsList, err := c.getReplicaSetsForRollouts(r) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } scalingEvent, err := c.isScalingEvent(r, rsList) if err != nil { - return err + return c.metricsServer.IncError(rollout, err) } if scalingEvent { return c.sync(r, rsList) @@ -375,3 +390,8 @@ func (c *Controller) handleObject(obj interface{}) { return } } + +func (c *Controller) reconcileError(rollout *v1alpha1.Rollout, err error) error { + c.metricsServer.IncPhase(rollout, metrics.Error) + return err +} diff --git a/controller/metrics/metrics.go b/controller/metrics/metrics.go new file mode 100644 index 0000000000..0e39e52334 --- /dev/null +++ b/controller/metrics/metrics.go @@ -0,0 +1,171 @@ +package metrics + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/labels" + + "time" + + v1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + rolloutlister 
"github.com/argoproj/argo-rollouts/pkg/client/listers/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/defaults" +) + +type MetricsServer struct { + *http.Server + reconcileHistogram *prometheus.HistogramVec + reconcilePhaseCounter *prometheus.CounterVec +} + +const ( + // MetricsPath is the endpoint to collect rollout metrics + MetricsPath = "/metrics" +) + +// Follow Prometheus naming practices +// https://prometheus.io/docs/practices/naming/ +var ( + descRolloutDefaultLabels = []string{"namespace", "name"} + + descRolloutWithStrategyLabels = append(descRolloutDefaultLabels, "strategy") + + descRolloutReconcilePhaseLabels = append(descRolloutWithStrategyLabels, "phase") + + descRolloutInfo = prometheus.NewDesc( + "rollout_info", + "Information about rollout.", + descRolloutWithStrategyLabels, + nil, + ) + + descRolloutCreated = prometheus.NewDesc( + "rollout_created_time", + "Creation time in unix timestamp for an rollout.", + descRolloutWithStrategyLabels, + nil, + ) +) + +// ReconcilePhase is the phase a reconcile can end in +type ReconcilePhase string + +const ( + + // InvalidSpec means the rollout had an invalid spec during reconciliation + InvalidSpec ReconcilePhase = "InvalidSpec" + // Error means the rollout returned with an error during reconciliation + Error ReconcilePhase = "Error" + // Completed means the rollout finished the reconciliation with no remaining work + Completed ReconcilePhase = "Completed" + // Progressing means the rollout finished the reconciliation with remaining work + Progressing ReconcilePhase = "Progressing" +) + +// NewMetricsServer returns a new prometheus server which collects rollout metrics +func NewMetricsServer(addr string, rolloutLister rolloutlister.RolloutLister) *MetricsServer { + mux := http.NewServeMux() + rolloutRegistry := NewRolloutRegistry(rolloutLister) + mux.Handle(MetricsPath, promhttp.HandlerFor(rolloutRegistry, promhttp.HandlerOpts{})) + + reconcileHistogram := prometheus.NewHistogramVec( +
prometheus.HistogramOpts{ + Name: "rollout_reconcile", + Help: "Rollout reconciliation performance.", + Buckets: []float64{0.01, 0.15, .25, .5, 1}, + }, + descRolloutWithStrategyLabels, + ) + + rolloutRegistry.MustRegister(reconcileHistogram) + + reconcilePhaseCounter := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "rollout_reconcile_phases", + Help: "Phase the rollout has", + }, + descRolloutReconcilePhaseLabels, + ) + rolloutRegistry.MustRegister(reconcilePhaseCounter) + + return &MetricsServer{ + Server: &http.Server{ + Addr: addr, + Handler: mux, + }, + reconcileHistogram: reconcileHistogram, + reconcilePhaseCounter: reconcilePhaseCounter, + } +} + +// IncReconcile records the duration of one reconciliation of a rollout in the reconcile histogram +func (m *MetricsServer) IncReconcile(rollout *v1alpha1.Rollout, duration time.Duration) { + m.reconcileHistogram.WithLabelValues(rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout)).Observe(duration.Seconds()) +} + +// IncError increments the Error phase counter for a rollout and returns err unchanged; rollout must be non-nil +func (m *MetricsServer) IncError(rollout *v1alpha1.Rollout, err error) error { + m.IncPhase(rollout, Error) + return err +} + +// IncPhase increments the counter of the given reconcile phase for a rollout; rollout must be non-nil +func (m *MetricsServer) IncPhase(rollout *v1alpha1.Rollout, phase ReconcilePhase) { + m.reconcilePhaseCounter.WithLabelValues(rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout), string(phase)).Inc() +} + +type rolloutCollector struct { + store rolloutlister.RolloutLister +} + +// NewRolloutCollector returns a prometheus collector for rollout metrics +func NewRolloutCollector(rolloutLister rolloutlister.RolloutLister) prometheus.Collector { + return &rolloutCollector{ + store: rolloutLister, + } +} + +// NewRolloutRegistry creates a new prometheus registry that collects rollouts +func NewRolloutRegistry(rolloutLister rolloutlister.RolloutLister) *prometheus.Registry { + registry := prometheus.NewRegistry() +
registry.MustRegister(NewRolloutCollector(rolloutLister)) + registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) + registry.MustRegister(prometheus.NewGoCollector()) + return registry +} + +// Describe implements the prometheus.Collector interface +func (c *rolloutCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descRolloutInfo + ch <- descRolloutCreated +} + +// Collect implements the prometheus.Collector interface +func (c *rolloutCollector) Collect(ch chan<- prometheus.Metric) { + rollouts, err := c.store.List(labels.NewSelector()) + if err != nil { + log.Warnf("Failed to collect rollouts: %v", err) + return + } + for _, rollout := range rollouts { + collectRollouts(ch, rollout) + } +} + +func collectRollouts(ch chan<- prometheus.Metric, rollout *v1alpha1.Rollout) { + + addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) { + lv = append([]string{rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout)}, lv...) + ch <- prometheus.MustNewConstMetric(desc, t, v, lv...) + } + addGauge := func(desc *prometheus.Desc, v float64, lv ...string) { + addConstMetric(desc, prometheus.GaugeValue, v, lv...) 
+ } + + addGauge(descRolloutInfo, 1) + + addGauge(descRolloutCreated, float64(rollout.CreationTimestamp.Unix())) +} diff --git a/controller/metrics/metrics_test.go b/controller/metrics/metrics_test.go new file mode 100644 index 0000000000..35382d67e6 --- /dev/null +++ b/controller/metrics/metrics_test.go @@ -0,0 +1,121 @@ +package metrics + +import ( + "context" + + "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/ghodss/yaml" + + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + + clientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/fake" + informer "github.com/argoproj/argo-rollouts/pkg/client/informers/externalversions" + lister "github.com/argoproj/argo-rollouts/pkg/client/listers/rollouts/v1alpha1" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/cache" +) + +// assertMetricsPrinted asserts every line in the expected lines appears in the body +func assertMetricsPrinted(t *testing.T, expectedLines, body string) { + for _, line := range strings.Split(expectedLines, "\n") { + assert.Contains(t, body, line) + } +} + +func newFakeRollout(fakeRollout string) *v1alpha1.Rollout { + var rollout v1alpha1.Rollout + err := yaml.Unmarshal([]byte(fakeRollout), &rollout) + if err != nil { + panic(err) + } + return &rollout +} + +func newFakeLister(fakeRollout ...string) (context.CancelFunc, lister.RolloutLister) { + ctx, cancel := context.WithCancel(context.Background()) + // cancel is returned to the caller, who stops the informer; deferring it here would stop the informer on return + var fakeRollouts []runtime.Object + for _, name := range fakeRollout { + fakeRollouts = append(fakeRollouts, newFakeRollout(name)) + } + appClientset := clientset.NewSimpleClientset(fakeRollouts...)
+ factory := informer.NewSharedInformerFactoryWithOptions(appClientset, 0) + rolloutInformer := factory.Argoproj().V1alpha1().Rollouts().Informer() + go rolloutInformer.Run(ctx.Done()) + if !cache.WaitForCacheSync(ctx.Done(), rolloutInformer.HasSynced) { + log.Fatal("Timed out waiting for caches to sync") + } + return cancel, factory.Argoproj().V1alpha1().Rollouts().Lister() +} + +func testRolloutDescribe(t *testing.T, fakeRollout string, expectedResponse string) { + cancel, rolloutLister := newFakeLister(fakeRollout) + defer cancel() + metricsServ := NewMetricsServer("localhost:8080", rolloutLister) + req, err := http.NewRequest("GET", "/metrics", nil) + assert.NoError(t, err) + rr := httptest.NewRecorder() + metricsServ.Handler.ServeHTTP(rr, req) + assert.Equal(t, rr.Code, http.StatusOK) + body := rr.Body.String() + log.Println(body) + assertMetricsPrinted(t, expectedResponse, body) +} + +type testCombination struct { + rollout string + expectedResponse string +} + +const fakeRollout = ` +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: guestbook-bluegreen + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: guestbook + template: + metadata: + labels: + app: guestbook + spec: + containers: + - name: guestbook + # The image below can be flip from 0.1 to 0.2 + image: gcr.io/heptio-images/ks-guestbook-demo:0.1 + ports: + - containerPort: 80 + minReadySeconds: 30 + revisionHistoryLimit: 3 + strategy: + blueGreen: + activeService: active-service + previewService: preview-service +` + +const expectedResponse = `# HELP rollout_created_time Creation time in unix timestamp for an rollout. 
+# TYPE rollout_created_time gauge +rollout_created_time{name="guestbook-bluegreen",namespace="default",strategy="blueGreen"} -6.21355968e+10 +` + +func TestMetrics(t *testing.T) { + combinations := []testCombination{ + { + rollout: fakeRollout, + expectedResponse: expectedResponse, + }, + } + + for _, combination := range combinations { + testRolloutDescribe(t, combination.rollout, combination.expectedResponse) + } +} diff --git a/controller/sync.go b/controller/sync.go index d39734108d..c22fc89dde 100644 --- a/controller/sync.go +++ b/controller/sync.go @@ -15,6 +15,7 @@ import ( "k8s.io/kubernetes/pkg/controller" labelsutil "k8s.io/kubernetes/pkg/util/labels" + "github.com/argoproj/argo-rollouts/controller/metrics" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/annotations" "github.com/argoproj/argo-rollouts/utils/conditions" @@ -196,10 +197,10 @@ func (c *Controller) sync(r *v1alpha1.Rollout, rsList []*appsv1.ReplicaSet) erro if err := c.scaleBlueGreen(r, newRS, oldRSs, previewSvc, activeSvc); err != nil { // If we get an error while trying to scale, the rollout will be requeued // so we can abort this resync - return err + return c.metricsServer.IncError(r, err) } allRSs := append([]*appsv1.ReplicaSet{newRS}, oldRSs...) - return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, r.Spec.Paused) + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, r.Spec.Paused, metrics.Progressing) } return fmt.Errorf("no rollout strategy provided") } @@ -346,7 +347,8 @@ func CreateTwoWayMergePatch(orig, new, dataStruct interface{}) ([]byte, bool, er } // persistRolloutStatus persists updates to rollout status. 
If no changes were made, it is a no-op -func (c *Controller) persistRolloutStatus(orig *v1alpha1.Rollout, newStatus *v1alpha1.RolloutStatus, newPause *bool) error { +func (c *Controller) persistRolloutStatus(orig *v1alpha1.Rollout, newStatus *v1alpha1.RolloutStatus, newPause *bool, phase metrics.ReconcilePhase) error { + c.metricsServer.IncPhase(orig, phase) specCopy := orig.Spec.DeepCopy() paused := specCopy.Paused if newPause != nil { diff --git a/examples/dashboard.json b/examples/dashboard.json new file mode 100644 index 0000000000..ef5f393f95 --- /dev/null +++ b/examples/dashboard.json @@ -0,0 +1,880 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 12, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "dtdurations", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "time() - 
max(process_start_time_seconds{job=\"argo-rollouts-metrics\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 3, + "y": 0 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_info)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Total Rollouts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 4, + "w": 15, + "x": 8, + "y": 0 + }, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rollout_info) by (strategy)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rollouts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 4, + "panels": [], + "title": "Controller Stats", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#d44a3a", + "rgba(237, 129, 40, 0.89)", + "#629e51" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 7, + "x": 0, + "y": 5 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + 
], + "sparkline": { + "fillColor": "rgb(27, 62, 27)", + "full": false, + "lineColor": "#37872D", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"Completed\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Stable", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#5794F2", + "#5794F2", + "#5794F2" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 7, + "y": 5 + }, + "id": 21, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "#5794F2", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"Progressing\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "1,3", + "title": "Progressing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": 
false, + "colorValue": true, + "colors": [ + "#F2495C", + "#F2495C", + "#F2495C" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 13, + "y": 5 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgb(101, 32, 33)", + "full": false, + "lineColor": "#F2495C", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"Error\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Error", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#F2495C", + "#F2495C", + "#F2495C" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 19, + "y": 5 + }, + "id": 24, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + 
"prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgb(101, 32, 33)", + "full": false, + "lineColor": "#F2495C", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"InvalidSpec\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Invalid Spec", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(rollout_reconcile_count[10m]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reconcile Activity", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + 
"cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "heatmap": {}, + "highlightCards": true, + "id": 8, + "legend": { + "show": false + }, + "links": [], + "targets": [ + { + "expr": "sum(increase(rollout_reconcile_bucket[10m])) by (le)", + "format": "heatmap", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Reconciliation Performance", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_heap_alloc_bytes{job=\"argo-rollouts-metrics\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 18, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Argo Rollouts", + "uid": "9zNpieqik", + "version": 5 +} diff --git a/manifests/base/argo-rollouts-metrics-service.yaml b/manifests/base/argo-rollouts-metrics-service.yaml new file mode 100644 index 0000000000..d43350489b --- /dev/null +++ b/manifests/base/argo-rollouts-metrics-service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + protocol: TCP + port: 8080 + targetPort: 8080 + selector: + app: argo-rollouts \ No newline at end of file diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 2ad97a71c3..aacc75c339 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -4,7 +4,7 @@ resources: - argo-rollouts-rolebinding.yaml - argo-rollouts-deployment.yaml - argo-rollouts-aggregate-roles.yaml - +- argo-rollouts-metrics-service.yaml imageTags: - name: argoproj/rollout-controlller newTag: latest diff --git a/manifests/install.yaml b/manifests/install.yaml index 6b0c498f1a..acde16ef6e 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -365,6 +365,19 @@ subjects: name: argo-rollouts namespace: argo-rollouts --- +apiVersion: v1 +kind: 
Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app: argo-rollouts +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 62f1637c7f..5d1d972340 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -316,6 +316,19 @@ subjects: - kind: ServiceAccount name: argo-rollouts --- +apiVersion: v1 +kind: Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app: argo-rollouts +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/utils/defaults/defaults.go b/utils/defaults/defaults.go index edc541135e..5332a42b5c 100644 --- a/utils/defaults/defaults.go +++ b/utils/defaults/defaults.go @@ -48,3 +48,13 @@ func GetMaxUnavailableOrDefault(rollout *v1alpha1.Rollout) *intstr.IntOrString { defaultValue := intstr.FromInt(DefaultMaxUnavailable) return &defaultValue } + +func GetStrategyType(rollout *v1alpha1.Rollout) string { + if rollout.Spec.Strategy.BlueGreenStrategy != nil { + return "blueGreen" + } + if rollout.Spec.Strategy.CanaryStrategy != nil { + return "canary" + } + return "No Strategy listed" +} diff --git a/utils/defaults/defaults_test.go b/utils/defaults/defaults_test.go index ff4b6c0d11..0575d47407 100644 --- a/utils/defaults/defaults_test.go +++ b/utils/defaults/defaults_test.go @@ -68,3 +68,30 @@ func TestGetMaxUnavailableOrDefault(t *testing.T) { rolloutDefaultValue := &v1alpha1.Rollout{} assert.Equal(t, intstr.FromInt(DefaultMaxUnavailable), *GetMaxUnavailableOrDefault(rolloutDefaultValue)) } + +func TestGetStrategyType(t *testing.T) { + bgRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{ + BlueGreenStrategy: &v1alpha1.BlueGreenStrategy{}, + }, + }, + } + assert.Equal(t, "blueGreen", 
GetStrategyType(bgRollout)) + + canaryRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{ + CanaryStrategy: &v1alpha1.CanaryStrategy{}, + }, + }, + } + assert.Equal(t, "canary", GetStrategyType(canaryRollout)) + + noStrategyRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{}, + }, + } + assert.Equal(t, "No Strategy listed", GetStrategyType(noStrategyRollout)) +}