From cd518fc0ca80d2163b3bf8bac3eb8f906495adbf Mon Sep 17 00:00:00 2001 From: Danny Thomson Date: Mon, 25 Mar 2019 08:38:08 -0700 Subject: [PATCH] Add Initial Prometheus Metrics --- Gopkg.lock | 77 +- controller/bluegreen.go | 2 +- controller/controller.go | 37 +- controller/metrics/metrics.go | 191 ++++ controller/metrics/metrics_test.go | 121 +++ controller/sync.go | 1 + examples/dashboard.json | 880 ++++++++++++++++++ .../base/argo-rollouts-metrics-service.yaml | 12 + manifests/base/kustomization.yaml | 2 +- manifests/install.yaml | 13 + manifests/namespace-install.yaml | 13 + utils/defaults/defaults.go | 10 + utils/defaults/defaults_test.go | 27 + 13 files changed, 1374 insertions(+), 12 deletions(-) create mode 100644 controller/metrics/metrics.go create mode 100644 controller/metrics/metrics_test.go create mode 100644 examples/dashboard.json create mode 100644 manifests/base/argo-rollouts-metrics-service.yaml diff --git a/Gopkg.lock b/Gopkg.lock index 26a581d702..8c2b69b980 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -33,6 +33,14 @@ pruneopts = "" revision = "38f6a293f140402953f884b015014e0cd519bbb3" +[[projects]] + branch = "master" + digest = "1:c0bec5f9b98d0bc872ff5e834fac186b807b656683bd29cb82fb207a1513fabb" + name = "github.com/beorn7/perks" + packages = ["quantile"] + pruneopts = "" + revision = "3a771d992973f24aa725d07868b467d1ddfceafb" + [[projects]] digest = "1:0deddd908b6b4b768cfc272c16ee61e7088a60f7fe2f06c547bd3d8e1f8b8e77" name = "github.com/davecgh/go-spew" @@ -246,6 +254,14 @@ pruneopts = "" revision = "60711f1a8329503b04e1c88535f419d0bb440bff" +[[projects]] + digest = "1:63722a4b1e1717be7b98fc686e0b30d5e7f734b9e93d7dee86293b6deab7ea28" + name = "github.com/matttproud/golang_protobuf_extensions" + packages = ["pbutil"] + pruneopts = "" + revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" + version = "v1.0.1" + [[projects]] digest = "1:0c0ff2a89c1bb0d01887e1dac043ad7efbf3ec77482ef058ac423d13497e16fd" name = "github.com/modern-go/concurrent" @@ -294,6 +310,14 @@ revision = "5f041e8faa004a95c88a202771f4cc3e991971e6" version = "v2.0.1" +[[projects]] + digest = "1:1d7e1867c49a6dd9856598ef7c3123604ea3daabf5b83f303ff457bcbc410b1d" + name = "github.com/pkg/errors" + packages = ["."] + pruneopts = "" + revision = "ba968bfe8b2f7e042a574c888954fccecfa385b4" + version = "v0.8.1" + [[projects]] digest = "1:256484dbbcd271f9ecebc6795b2df8cad4c458dd0f5fd82a8c2fa0c29f233411" name = "github.com/pmezard/go-difflib" @@ -302,6 +326,52 @@ revision = "792786c7400a136282c1664665ae0a8db921c6c2" version = "v1.0.0" +[[projects]] + digest = "1:6f218995d6a74636cfcab45ce03005371e682b4b9bee0e5eb0ccfd83ef85364f" + name = "github.com/prometheus/client_golang" + packages = [ + "prometheus", + "prometheus/internal", + "prometheus/promhttp", + ] + pruneopts = "" + revision = "505eaef017263e299324067d40ca2c48f6a2cf50" + version = "v0.9.2" + +[[projects]] + branch = "master" + digest = "1:cd67319ee7536399990c4b00fae07c3413035a53193c644549a676091507cadc" + name = "github.com/prometheus/client_model" + packages = ["go"] + pruneopts = "" + revision = "fd36f4220a901265f90734c3183c5f0c91daa0b8" + +[[projects]] + digest = "1:96af18a3819d2ff7d6aa07e6e50955b11e477dbc8b890324c67462b84adca56b" + name = "github.com/prometheus/common" + packages = [ + "expfmt", + "internal/bitbucket.org/ww/goautoneg", + "model", + ] + pruneopts = "" + revision = "cfeb6f9992ffa54aaa4f2170ade4067ee478b250" + version = "v0.2.0" + +[[projects]] + branch = "master" + digest = "1:5dff64a37ab1e65130c24f01d5fbda61226b73cc61b6f0c8af24373509a89b73" + name = "github.com/prometheus/procfs" + packages = [ + ".", + "internal/util", + "iostats", + "nfs", + "xfs", + ] + pruneopts = "" + revision = "55ae3d9d557340b5bc24cd8aa5f6fa2c2ab31352" + [[projects]] digest = "1:9a3c631555e0351fdc4e696577bb63afd90c399d782a8462dba9d100d7021db3" name = "github.com/sirupsen/logrus" @@ -909,6 +979,9 @@ "github.com/ghodss/yaml", "github.com/go-openapi/spec", "github.com/golang/glog", + "github.com/pkg/errors", + "github.com/prometheus/client_golang/prometheus", + "github.com/prometheus/client_golang/prometheus/promhttp", "github.com/sirupsen/logrus", "github.com/spf13/cobra", "github.com/stretchr/testify/assert", @@ -924,6 +997,7 @@ "k8s.io/apimachinery/pkg/runtime/schema", "k8s.io/apimachinery/pkg/runtime/serializer", "k8s.io/apimachinery/pkg/types", + "k8s.io/apimachinery/pkg/util/diff", "k8s.io/apimachinery/pkg/util/intstr", "k8s.io/apimachinery/pkg/util/rand", "k8s.io/apimachinery/pkg/util/runtime", @@ -936,13 +1010,11 @@ "k8s.io/client-go/discovery/fake", "k8s.io/client-go/informers", "k8s.io/client-go/informers/apps/v1", - "k8s.io/client-go/informers/core/v1", "k8s.io/client-go/kubernetes", "k8s.io/client-go/kubernetes/fake", "k8s.io/client-go/kubernetes/scheme", "k8s.io/client-go/kubernetes/typed/core/v1", "k8s.io/client-go/listers/apps/v1", - "k8s.io/client-go/listers/core/v1", "k8s.io/client-go/plugin/pkg/client/auth/gcp", "k8s.io/client-go/plugin/pkg/client/auth/oidc", "k8s.io/client-go/rest", @@ -958,6 +1030,7 @@ "k8s.io/kubernetes/pkg/controller", "k8s.io/kubernetes/pkg/util/hash", "k8s.io/kubernetes/pkg/util/labels", + "k8s.io/utils/pointer", ] solver-name = "gps-cdcl" solver-version = 1 diff --git a/controller/bluegreen.go b/controller/bluegreen.go index 8f4afadeff..6d15ebd906 100644 --- a/controller/bluegreen.go +++ b/controller/bluegreen.go @@ -27,7 +27,6 @@ func (c *Controller) rolloutBlueGreen(r *v1alpha1.Rollout, rsList []*appsv1.Repl return err } allRSs := append(oldRSs, newRS) - // Scale up, if we can. logCtx.Infof("Reconciling new ReplicaSet '%s'", newRS.Name) scaledUp, err := c.reconcileNewReplicaSet(allRSs, newRS, r) @@ -86,6 +85,7 @@ func (c *Controller) rolloutBlueGreen(r *v1alpha1.Rollout, rsList []*appsv1.Repl return err } } + return c.syncRolloutStatusBlueGreen(allRSs, newRS, previewSvc, activeSvc, r, false) } diff --git a/controller/controller.go b/controller/controller.go index 03eaa01449..758dfdae60 100644 --- a/controller/controller.go +++ b/controller/controller.go @@ -6,10 +6,11 @@ import ( "time" "github.com/golang/glog" + "github.com/pkg/errors" log "github.com/sirupsen/logrus" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -24,6 +25,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/kubernetes/pkg/controller" + "github.com/argoproj/argo-rollouts/controller/metrics" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" clientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned" rolloutscheme "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/scheme" @@ -67,6 +69,7 @@ type Controller struct { replicaSetSynced cache.InformerSynced rolloutsLister listers.RolloutLister rolloutsSynced cache.InformerSynced + metricsServer *metrics.MetricsServer // used for unit testing enqueueRollout func(obj interface{}) @@ -105,6 +108,7 @@ func NewController( KubeClient: kubeclientset, Recorder: recorder, } + metricsAddr := fmt.Sprintf("0.0.0.0:%d", 8080) controller := &Controller{ kubeclientset: kubeclientset, @@ -117,6 +121,7 @@ func NewController( workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "Rollouts"), recorder: recorder, resyncPeriod: resyncPeriod, + metricsServer: metrics.NewMetricsServer(metricsAddr, rolloutsInformer.Lister()), } controller.enqueueRollout = controller.enqueueRateLimited controller.enqueueRolloutAfter = controller.enqueueAfter @@ -171,6 +176,14 @@ func (c *Controller) Run(threadiness int, stopCh <-chan struct{}) error { } log.Info("Started workers") + go func() { + log.Infof("Starting Metric Server at %s", c.metricsServer.Addr) + err := c.metricsServer.ListenAndServe() + if err != nil { + err = errors.Wrap(err, "Starting Metric Server") + log.Fatal(err) + } + }() <-stopCh log.Info("Shutting down workers") @@ -221,9 +234,15 @@ func (c *Controller) processNextWorkItem() bool { // Run the syncHandler, passing it the namespace/name string of the // Rollout resource to be synced. if err := c.syncHandler(key); err != nil { + err := fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) + namespace, name, splitErr := cache.SplitMetaNamespaceKey(key) + if splitErr != nil { + return errors.Wrapf(err, "Error splitting key %s: %s", key, splitErr.Error()) + } + c.metricsServer.IncError(namespace, name) // Put the item back on the workqueue to handle any transient errors. c.workqueue.AddRateLimited(key) - return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) + return err } // Finally, if no error occurs we Forget this item so it does not // get queued again until another change happens. @@ -246,18 +265,14 @@ func (c *Controller) processNextWorkItem() bool { func (c *Controller) syncHandler(key string) error { startTime := time.Now() log.WithField(logutil.RolloutKey, key).Infof("Started syncing rollout at (%v)", startTime) - defer func() { - log.WithField(logutil.RolloutKey, key).Infof("Finished syncing rollout (%v)", time.Since(startTime)) - }() - namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { return err } rollout, err := c.rolloutsLister.Rollouts(namespace).Get(name) - if errors.IsNotFound(err) { + if k8serrors.IsNotFound(err) { log.WithField(logutil.RolloutKey, key).Infof("Rollout %v has been deleted", key) - return nil + return err } if err != nil { return err @@ -265,6 +280,12 @@ func (c *Controller) syncHandler(key string) error { // Deep-copy otherwise we are mutating our cache. r := rollout.DeepCopy() + defer func() { + duration := time.Since(startTime) + c.metricsServer.IncReconcile(r, duration) + logCtx := logutil.WithRollout(r).WithField("time_ms", duration.Seconds()*1e3) + logCtx.Info("Reconciliation completed") + }() prevCond := conditions.GetRolloutCondition(rollout.Status, v1alpha1.InvalidSpec) invalidSpecCond := conditions.VerifyRolloutSpec(r, prevCond) diff --git a/controller/metrics/metrics.go b/controller/metrics/metrics.go new file mode 100644 index 0000000000..206effb87a --- /dev/null +++ b/controller/metrics/metrics.go @@ -0,0 +1,191 @@ +package metrics + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + + "time" + + v1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + rolloutlister "github.com/argoproj/argo-rollouts/pkg/client/listers/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/conditions" + "github.com/argoproj/argo-rollouts/utils/defaults" +) + +type MetricsServer struct { + *http.Server + reconcileHistogram *prometheus.HistogramVec + reconcilePhaseCounter *prometheus.CounterVec + errorCounter *prometheus.CounterVec +} + +const ( + // MetricsPath is the endpoint to collect rollout metrics + MetricsPath = "/metrics" +) + +// Follow Prometheus naming practices +// https://prometheus.io/docs/practices/naming/ +var ( + descRolloutDefaultLabels = []string{"namespace", "name"} + + descRolloutWithStrategyLabels = append(descRolloutDefaultLabels, "strategy") + + descRolloutReconcilePhaseLabels = append(descRolloutWithStrategyLabels, "phase") + + descRolloutInfo = prometheus.NewDesc( + "rollout_info", + "Information about rollout.", + descRolloutWithStrategyLabels, + nil, + ) + + descRolloutCreated = prometheus.NewDesc( + "rollout_created_time", + "Creation time in unix timestamp for an rollout.", + descRolloutWithStrategyLabels, + nil, + ) +) + +// ReconcilePhase the phases of a reconcile can have +type ReconcilePhase string + +const ( + + // InvalidSpec means the rollout had an InvalidSpec during reconciliation + InvalidSpec ReconcilePhase = "InvalidSpec" + // Completed means the rollout finished the reconciliation with no remaining work + Completed ReconcilePhase = "Completed" + // Progressing means the rollout finished the reconciliation with remaining work + Progressing ReconcilePhase = "Progressing" +) + +// NewMetricsServer returns a new prometheus server which collects rollout metrics +func NewMetricsServer(addr string, rolloutLister rolloutlister.RolloutLister) *MetricsServer { + mux := http.NewServeMux() + rolloutRegistry := NewRolloutRegistry(rolloutLister) + mux.Handle(MetricsPath, promhttp.HandlerFor(rolloutRegistry, promhttp.HandlerOpts{})) + + reconcileHistogram := prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "rollout_reconcile", + Help: "Rollout reconciliation performance.", + Buckets: []float64{0.01, 0.15, .25, .5, 1}, + }, + append(descRolloutWithStrategyLabels), + ) + + rolloutRegistry.MustRegister(reconcileHistogram) + + reconcilePhaseCounter := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "rollout_reconcile_phases", + Help: "Phase the rollout has", + }, + append(descRolloutReconcilePhaseLabels), + ) + rolloutRegistry.MustRegister(reconcilePhaseCounter) + + errorCounter := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "rollout_reconcile_error", + Help: "Error occuring during the rollout", + }, + append(descRolloutDefaultLabels), + ) + + rolloutRegistry.MustRegister(errorCounter) + + return &MetricsServer{ + Server: &http.Server{ + Addr: addr, + Handler: mux, + }, + reconcileHistogram: reconcileHistogram, + reconcilePhaseCounter: reconcilePhaseCounter, + errorCounter: errorCounter, + } +} + +// IncReconcile increments the reconcile counter for an rollout +func (m *MetricsServer) IncReconcile(rollout *v1alpha1.Rollout, duration time.Duration) { + m.reconcileHistogram.WithLabelValues(rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout)).Observe(duration.Seconds()) +} + +// IncError increments the reconcile counter for an rollout +func (m *MetricsServer) IncError(namespace, name string) { + m.errorCounter.WithLabelValues(namespace, name).Inc() +} + +// IncError increments the error counter for an rollout +func (m *MetricsServer) IncPhase(rollout *v1alpha1.Rollout, newStatus *v1alpha1.RolloutStatus) { + phase := Progressing + available := conditions.GetRolloutCondition(*newStatus, v1alpha1.InvalidSpec) + if available != nil && available.Status == corev1.ConditionTrue { + phase = Completed + } + invalidSpec := conditions.GetRolloutCondition(*newStatus, v1alpha1.InvalidSpec) + if invalidSpec != nil { + phase = InvalidSpec + } + m.reconcilePhaseCounter.WithLabelValues(rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout), string(phase)).Inc() +} + +type rolloutCollector struct { + store rolloutlister.RolloutLister +} + +// NewRolloutCollector returns a prometheus collector for rollout metrics +func NewRolloutCollector(rolloutLister rolloutlister.RolloutLister) prometheus.Collector { + return &rolloutCollector{ + store: rolloutLister, + } +} + +// NewRolloutRegistry creates a new prometheus registry that collects rollouts +func NewRolloutRegistry(rolloutLister rolloutlister.RolloutLister) *prometheus.Registry { + registry := prometheus.NewRegistry() + registry.MustRegister(NewRolloutCollector(rolloutLister)) + registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) + registry.MustRegister(prometheus.NewGoCollector()) + return registry +} + +// Describe implements the prometheus.Collector interface +func (c *rolloutCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- descRolloutInfo + ch <- descRolloutCreated +} + +// Collect implements the prometheus.Collector interface +func (c *rolloutCollector) Collect(ch chan<- prometheus.Metric) { + rollouts, err := c.store.List(labels.NewSelector()) + if err != nil { + log.Warnf("Failed to collect rollouts: %v", err) + return + } + for _, rollout := range rollouts { + collectRollouts(ch, rollout) + } +} + +func collectRollouts(ch chan<- prometheus.Metric, rollout *v1alpha1.Rollout) { + + addConstMetric := func(desc *prometheus.Desc, t prometheus.ValueType, v float64, lv ...string) { + lv = append([]string{rollout.Namespace, rollout.Name, defaults.GetStrategyType(rollout)}, lv...) + ch <- prometheus.MustNewConstMetric(desc, t, v, lv...) + } + addGauge := func(desc *prometheus.Desc, v float64, lv ...string) { + addConstMetric(desc, prometheus.GaugeValue, v, lv...) + } + + addGauge(descRolloutInfo, 1) + + addGauge(descRolloutCreated, float64(rollout.CreationTimestamp.Unix())) +} diff --git a/controller/metrics/metrics_test.go b/controller/metrics/metrics_test.go new file mode 100644 index 0000000000..35382d67e6 --- /dev/null +++ b/controller/metrics/metrics_test.go @@ -0,0 +1,121 @@ +package metrics + +import ( + "context" + + "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/ghodss/yaml" + + "log" + "net/http" + "net/http/httptest" + "strings" + "testing" + + clientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/fake" + informer "github.com/argoproj/argo-rollouts/pkg/client/informers/externalversions" + lister "github.com/argoproj/argo-rollouts/pkg/client/listers/rollouts/v1alpha1" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/cache" +) + +// assertMetricsPrinted asserts every line in the expected lines appears in the body +func assertMetricsPrinted(t *testing.T, expectedLines, body string) { + for _, line := range strings.Split(expectedLines, "\n") { + assert.Contains(t, body, line) + } +} + +func newFakeRollout(fakeRollout string) *v1alpha1.Rollout { + var rollout v1alpha1.Rollout + err := yaml.Unmarshal([]byte(fakeRollout), &rollout) + if err != nil { + panic(err) + } + return &rollout +} + +func newFakeLister(fakeRollout ...string) (context.CancelFunc, lister.RolloutLister) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + var fakeRollouts []runtime.Object + for _, name := range fakeRollout { + fakeRollouts = append(fakeRollouts, newFakeRollout(name)) + } + appClientset := clientset.NewSimpleClientset(fakeRollouts...) + factory := informer.NewSharedInformerFactoryWithOptions(appClientset, 0) + rolloutInformer := factory.Argoproj().V1alpha1().Rollouts().Informer() + go rolloutInformer.Run(ctx.Done()) + if !cache.WaitForCacheSync(ctx.Done(), rolloutInformer.HasSynced) { + log.Fatal("Timed out waiting for caches to sync") + } + return cancel, factory.Argoproj().V1alpha1().Rollouts().Lister() +} + +func testRolloutDescribe(t *testing.T, fakeRollout string, expectedResponse string) { + cancel, rolloutLister := newFakeLister(fakeRollout) + defer cancel() + metricsServ := NewMetricsServer("localhost:8080", rolloutLister) + req, err := http.NewRequest("GET", "/metrics", nil) + assert.NoError(t, err) + rr := httptest.NewRecorder() + metricsServ.Handler.ServeHTTP(rr, req) + assert.Equal(t, rr.Code, http.StatusOK) + body := rr.Body.String() + log.Println(body) + assertMetricsPrinted(t, expectedResponse, body) +} + +type testCombination struct { + rollout string + expectedResponse string +} + +const fakeRollout = ` +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: guestbook-bluegreen + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: guestbook + template: + metadata: + labels: + app: guestbook + spec: + containers: + - name: guestbook + # The image below can be flip from 0.1 to 0.2 + image: gcr.io/heptio-images/ks-guestbook-demo:0.1 + ports: + - containerPort: 80 + minReadySeconds: 30 + revisionHistoryLimit: 3 + strategy: + blueGreen: + activeService: active-service + previewService: preview-service +` + +const expectedResponse = `# HELP rollout_created_time Creation time in unix timestamp for an rollout. +# TYPE rollout_created_time gauge +rollout_created_time{name="guestbook-bluegreen",namespace="default",strategy="blueGreen"} -6.21355968e+10 +` + +func TestMetrics(t *testing.T) { + combinations := []testCombination{ + { + rollout: fakeRollout, + expectedResponse: expectedResponse, + }, + } + + for _, combination := range combinations { + testRolloutDescribe(t, combination.rollout, combination.expectedResponse) + } +} diff --git a/controller/sync.go b/controller/sync.go index d39734108d..8bccb64fd2 100644 --- a/controller/sync.go +++ b/controller/sync.go @@ -347,6 +347,7 @@ func CreateTwoWayMergePatch(orig, new, dataStruct interface{}) ([]byte, bool, er // persistRolloutStatus persists updates to rollout status. If no changes were made, it is a no-op func (c *Controller) persistRolloutStatus(orig *v1alpha1.Rollout, newStatus *v1alpha1.RolloutStatus, newPause *bool) error { + c.metricsServer.IncPhase(orig, newStatus) specCopy := orig.Spec.DeepCopy() paused := specCopy.Paused if newPause != nil { diff --git a/examples/dashboard.json b/examples/dashboard.json new file mode 100644 index 0000000000..29a54475f1 --- /dev/null +++ b/examples/dashboard.json @@ -0,0 +1,880 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 12, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "dtdurations", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "time() - max(process_start_time_seconds{job=\"argo-rollouts-metrics\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 3, + "y": 0 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_info)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Total Rollouts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 4, + "w": 15, + "x": 8, + "y": 0 + }, + "id": 25, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rollout_info) by (strategy)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rollouts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 4, + "panels": [], + "title": "Controller Stats", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#d44a3a", + "rgba(237, 129, 40, 0.89)", + "#629e51" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 7, + "x": 0, + "y": 5 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgb(27, 62, 27)", + "full": false, + "lineColor": "#37872D", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"Completed\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Stable", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#5794F2", + "#5794F2", + "#5794F2" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 7, + "y": 5 + }, + "id": 21, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "#5794F2", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"Progressing\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "1,3", + "title": "Progressing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#F2495C", + "#F2495C", + "#F2495C" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 13, + "y": 5 + }, + "id": 24, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgb(101, 32, 33)", + "full": false, + "lineColor": "#F2495C", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_phases{phase=\"InvalidSpec\"} == 1)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Invalid Spec", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "#F2495C", + "#F2495C", + "#F2495C" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgb(101, 32, 33)", + "full": false, + "lineColor": "#F2495C", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rollout_reconcile_error)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "0,1", + "title": "Error", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(rollout_reconcile_count[10m]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reconcile Activity", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "heatmap": {}, + "highlightCards": true, + "id": 8, + "legend": { + "show": false + }, + "links": [], + "targets": [ + { + "expr": "sum(increase(rollout_reconcile_bucket[10m])) by (le)", + "format": "heatmap", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Reconciliation Performance", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_memstats_heap_alloc_bytes{job=\"argo-rollouts-metrics\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 18, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Argo Rollouts", + "uid": "9zNpieqik", + "version": 5 +} diff --git a/manifests/base/argo-rollouts-metrics-service.yaml b/manifests/base/argo-rollouts-metrics-service.yaml new file mode 100644 index 0000000000..d43350489b --- /dev/null +++ b/manifests/base/argo-rollouts-metrics-service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + protocol: TCP + port: 8080 + targetPort: 8080 + selector: + app: argo-rollouts \ No newline at end of file diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 2ad97a71c3..aacc75c339 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -4,7 +4,7 @@ resources: - argo-rollouts-rolebinding.yaml - argo-rollouts-deployment.yaml - argo-rollouts-aggregate-roles.yaml - +- argo-rollouts-metrics-service.yaml imageTags: - name: argoproj/rollout-controlller newTag: latest diff --git a/manifests/install.yaml b/manifests/install.yaml index 6b0c498f1a..acde16ef6e 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -365,6 +365,19 @@ subjects: name: argo-rollouts namespace: argo-rollouts --- +apiVersion: v1 +kind: Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app: argo-rollouts +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index 62f1637c7f..5d1d972340 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -316,6 +316,19 @@ subjects: - kind: ServiceAccount name: argo-rollouts --- +apiVersion: v1 +kind: Service +metadata: + name: argo-rollouts-metrics +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app: argo-rollouts +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/utils/defaults/defaults.go b/utils/defaults/defaults.go index edc541135e..5332a42b5c 100644 --- a/utils/defaults/defaults.go +++ b/utils/defaults/defaults.go @@ -48,3 +48,13 @@ func GetMaxUnavailableOrDefault(rollout *v1alpha1.Rollout) *intstr.IntOrString { defaultValue := intstr.FromInt(DefaultMaxUnavailable) return &defaultValue } + +func GetStrategyType(rollout *v1alpha1.Rollout) string { + if rollout.Spec.Strategy.BlueGreenStrategy != nil { + return "blueGreen" + } + if rollout.Spec.Strategy.CanaryStrategy != nil { + return "canary" + } + return "No Strategy listed" +} diff --git a/utils/defaults/defaults_test.go b/utils/defaults/defaults_test.go index ff4b6c0d11..0575d47407 100644 --- a/utils/defaults/defaults_test.go +++ b/utils/defaults/defaults_test.go @@ -68,3 +68,30 @@ func TestGetMaxUnavailableOrDefault(t *testing.T) { rolloutDefaultValue := &v1alpha1.Rollout{} assert.Equal(t, intstr.FromInt(DefaultMaxUnavailable), *GetMaxUnavailableOrDefault(rolloutDefaultValue)) } + +func TestGetStrategyType(t *testing.T) { + bgRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{ + BlueGreenStrategy: &v1alpha1.BlueGreenStrategy{}, + }, + }, + } + assert.Equal(t, "blueGreen", GetStrategyType(bgRollout)) + + canaryRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{ + CanaryStrategy: &v1alpha1.CanaryStrategy{}, + }, + }, + } + assert.Equal(t, "canary", GetStrategyType(canaryRollout)) + + noStrategyRollout := &v1alpha1.Rollout{ + Spec: v1alpha1.RolloutSpec{ + Strategy: v1alpha1.RolloutStrategy{}, + }, + } + assert.Equal(t, "No Strategy listed", GetStrategyType(noStrategyRollout)) +}