Skip to content

Commit

Permalink
feat: Report notification metrics for rollouts (argoproj#1856)
Browse files Browse the repository at this point in the history
feat: Report notification metrics for rollouts (argoproj#1856)

Signed-off-by: Ravi Hari <[email protected]>
  • Loading branch information
RaviHari authored Mar 22, 2022
1 parent 08cf10e commit 55a041a
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 12 deletions.
2 changes: 1 addition & 1 deletion controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func NewManager(

refResolver := rollout.NewInformerBasedWorkloadRefResolver(namespace, dynamicclientset, discoveryClient, argoprojclientset, rolloutsInformer.Informer())
apiFactory := notificationapi.NewFactory(record.NewAPIFactorySettings(), defaults.Namespace(), secretInformer.Informer(), configMapInformer.Informer())
recorder := record.NewEventRecorder(kubeclientset, metrics.MetricRolloutEventsTotal, apiFactory)
recorder := record.NewEventRecorder(kubeclientset, metrics.MetricRolloutEventsTotal, metrics.MetricNotificationFailedTotal, metrics.MetricNotificationSuccessTotal, metrics.MetricNotificationSend, apiFactory)
notificationsController := notificationcontroller.NewController(dynamicclientset.Resource(v1alpha1.RolloutGVR), rolloutsInformer.Informer(), apiFactory,
notificationcontroller.WithToUnstructured(func(obj metav1.Object) (*unstructured.Unstructured, error) {
data, err := json.Marshal(obj)
Expand Down
12 changes: 10 additions & 2 deletions controller/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ type MetricsServer struct {

reconcileAnalysisRunHistogram *prometheus.HistogramVec
errorAnalysisRunCounter *prometheus.CounterVec

k8sRequestsCounter *K8sRequestsCountProvider
successNotificationCounter *prometheus.CounterVec
errorNotificationCounter *prometheus.CounterVec
sendNotificationRunHistogram *prometheus.HistogramVec
k8sRequestsCounter *K8sRequestsCountProvider
}

const (
Expand Down Expand Up @@ -75,6 +77,9 @@ func NewMetricsServer(cfg ServerConfig, isPrimary bool) *MetricsServer {
reg.MustRegister(MetricExperimentReconcileError)
reg.MustRegister(MetricAnalysisRunReconcile)
reg.MustRegister(MetricAnalysisRunReconcileError)
reg.MustRegister(MetricNotificationSuccessTotal)
reg.MustRegister(MetricNotificationFailedTotal)
reg.MustRegister(MetricNotificationSend)
reg.MustRegister(MetricVersionGauge)

mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{
Expand All @@ -96,6 +101,9 @@ func NewMetricsServer(cfg ServerConfig, isPrimary bool) *MetricsServer {

reconcileAnalysisRunHistogram: MetricAnalysisRunReconcile,
errorAnalysisRunCounter: MetricAnalysisRunReconcileError,
successNotificationCounter: MetricNotificationSuccessTotal,
errorNotificationCounter: MetricNotificationFailedTotal,
sendNotificationRunHistogram: MetricNotificationSend,

k8sRequestsCounter: cfg.K8SRequestProvider,
}
Expand Down
28 changes: 28 additions & 0 deletions controller/metrics/prommetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,34 @@ var (
)
)

// Notification metrics
var (
	// MetricNotificationSuccessTotal is a counter vector for notifications
	// that were sent without error. Its labels are namespaceNameLabels
	// followed by "type" and "reason".
	MetricNotificationSuccessTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "notification_send_success",
			Help: "Notification send success.",
		},
		append(namespaceNameLabels, "type", "reason"),
	)

	// MetricNotificationFailedTotal is a counter vector for notifications
	// whose send attempt returned an error. It carries the same label set as
	// MetricNotificationSuccessTotal.
	MetricNotificationFailedTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "notification_send_error",
			Help: "Error sending the notification",
		},
		append(namespaceNameLabels, "type", "reason"),
	)

	// MetricNotificationSend is a histogram vector of the time taken to send
	// a notification, labelled by namespaceNameLabels only. The bucket upper
	// bounds range from 0.01 to 1.
	MetricNotificationSend = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "notification_send",
			Help:    "Notification send performance.",
			Buckets: []float64{0.01, 0.15, 0.25, 0.5, 1},
		},
		namespaceNameLabels,
	)
)

// K8s Client metrics
var (
// Custom events metric
Expand Down
7 changes: 7 additions & 0 deletions docs/features/notifications.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,10 @@ data:
```

Each condition might use several templates. Typically each template is responsible for generating a service-specific notification part.

### Notification Metrics

The following Prometheus metrics are emitted when notifications are enabled in Argo Rollouts:

- `notification_send_success` is a counter that measures how many times a notification was sent successfully.
- `notification_send_error` is a counter that measures how many times a notification failed to send.
- `notification_send` is a histogram that measures how long sending a notification takes, in seconds.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ require (
github.com/mitchellh/mapstructure v1.4.3
github.com/newrelic/newrelic-client-go v0.72.0
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.11.0
github.com/prometheus/client_golang v1.12.1
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.32.1
github.com/servicemeshinterface/smi-sdk-go v0.4.1
Expand Down Expand Up @@ -135,7 +135,7 @@ require (
github.com/opsgenie/opsgenie-go-sdk-v2 v1.0.5 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/procfs v0.6.0 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
github.com/russross/blackfriday v1.5.2 // indirect
github.com/slack-go/slack v0.10.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -739,8 +739,9 @@ github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDf
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU=
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
github.com/prometheus/client_golang v1.11.0 h1:HNkLOAEQMIDv/K+04rukrLx6ch7msSRwf3/SASFAGtQ=
github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0=
github.com/prometheus/client_golang v1.12.1 h1:ZiaPsmm9uiBeaSMRznKsCDNtPCS0T3JVDGF+06gjBzk=
github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand All @@ -761,8 +762,9 @@ github.com/prometheus/procfs v0.0.0-20190522114515-bc1a522cf7b1/go.mod h1:TjEm7z
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.6.0 h1:mxy4L2jP6qMonqmq+aTtOx1ifVWUgG/TAmntgbh3xv4=
github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.7.3 h1:4jVXhlkAyzOScmCkXBTOLRLTz8EeU+eyjrwB/EPq0VU=
github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/quobyte/api v0.1.8/go.mod h1:jL7lIHrmqQ7yh05OJ+eEEdHr0u/kmT1Ff9iHd+4H6VI=
github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uYEyJGbgTkfkS4+E/PavXkNJcbFIpEtjt2B0KDQ5+9M=
Expand Down
4 changes: 4 additions & 0 deletions metricproviders/prometheus/mock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ type mockAPI struct {
warnings v1.Warnings
}

// WalReplay is a stub that is not expected to be called by the tests in this
// package; it panics if invoked.
// NOTE(review): presumably added so mockAPI keeps satisfying the Prometheus
// v1 API interface after the client_golang upgrade — confirm.
func (m mockAPI) WalReplay(_ context.Context) (v1.WalReplayStatus, error) {
	panic("Not used")
}

// Query performs a query for the given time.
func (m mockAPI) Query(ctx context.Context, query string, ts time.Time) (model.Value, v1.Warnings, error) {
if m.err != nil {
Expand Down
47 changes: 43 additions & 4 deletions utils/record/record.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"strings"
"time"

timeutil "github.com/argoproj/argo-rollouts/utils/time"

"github.com/argoproj/notifications-engine/pkg/api"
"github.com/argoproj/notifications-engine/pkg/services"
"github.com/argoproj/notifications-engine/pkg/subscriptions"
Expand Down Expand Up @@ -65,13 +67,18 @@ type EventRecorderAdapter struct {
Recorder record.EventRecorder
// RolloutEventCounter is a counter to increment on events
RolloutEventCounter *prometheus.CounterVec
// NotificationFailedCounter is a counter to increment when sending a notification fails
NotificationFailedCounter *prometheus.CounterVec
// NotificationSuccessCounter is a counter to increment when a notification is sent successfully
NotificationSuccessCounter *prometheus.CounterVec
NotificationSendPerformance *prometheus.HistogramVec

eventf func(object runtime.Object, warn bool, opts EventOptions, messageFmt string, args ...interface{})
// apiFactory is a notifications engine API factory
apiFactory api.Factory
}

func NewEventRecorder(kubeclientset kubernetes.Interface, rolloutEventCounter *prometheus.CounterVec, apiFactory api.Factory) EventRecorder {
func NewEventRecorder(kubeclientset kubernetes.Interface, rolloutEventCounter *prometheus.CounterVec, notificationFailedCounter *prometheus.CounterVec, notificationSuccessCounter *prometheus.CounterVec, notificationSendPerformance *prometheus.HistogramVec, apiFactory api.Factory) EventRecorder {
// Create event broadcaster
// Add argo-rollouts custom resources to the default Kubernetes Scheme so Events can be
// logged for argo-rollouts types.
Expand All @@ -80,9 +87,12 @@ func NewEventRecorder(kubeclientset kubernetes.Interface, rolloutEventCounter *p
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeclientset.CoreV1().Events("")})
k8srecorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: controllerAgentName})
recorder := &EventRecorderAdapter{
Recorder: k8srecorder,
RolloutEventCounter: rolloutEventCounter,
apiFactory: apiFactory,
Recorder: k8srecorder,
RolloutEventCounter: rolloutEventCounter,
NotificationFailedCounter: notificationFailedCounter,
NotificationSuccessCounter: notificationSuccessCounter,
NotificationSendPerformance: notificationSendPerformance,
apiFactory: apiFactory,
}
recorder.eventf = recorder.defaultEventf
return recorder
Expand Down Expand Up @@ -137,6 +147,26 @@ func NewFakeEventRecorder() *FakeEventRecorder {
},
[]string{"name", "namespace", "type", "reason"},
),
prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "notification_send_error",
},
[]string{"name", "namespace", "type", "reason"},
),
prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "notification_send_success",
},
[]string{"name", "namespace", "type", "reason"},
),
prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "notification_send_performance",
Help: "Notification send performance.",
Buckets: []float64{0.01, 0.15, .25, .5, 1},
},
[]string{"namespace", "name"},
),
NewFakeApiFactory(),
).(*EventRecorderAdapter)
recorder.Recorder = record.NewFakeRecorder(1000)
Expand Down Expand Up @@ -178,7 +208,9 @@ func (e *EventRecorderAdapter) defaultEventf(object runtime.Object, warn bool, o
// Record each send attempt exactly once: as a failure when
// sendNotifications returns an error, otherwise as a success. Previously
// the success counter was incremented unconditionally, so every failed
// attempt was also counted as a success.
// NOTE(review): a nil error is counted as success; confirm whether
// sendNotifications can return nil without delivering anything (e.g. when
// the rollout has no subscribers).
err := e.sendNotifications(object, opts)
if err != nil {
	logCtx.Errorf("Notifications failed to send for eventReason %s with error: %s", opts.EventReason, err)
	e.NotificationFailedCounter.WithLabelValues(namespace, name, opts.EventType, opts.EventReason).Inc()
} else {
	e.NotificationSuccessCounter.WithLabelValues(namespace, name, opts.EventType, opts.EventReason).Inc()
}
}

logFn := logCtx.Infof
Expand Down Expand Up @@ -207,6 +239,13 @@ func NewAPIFactorySettings() api.Settings {
// Send notifications for triggered event if user is subscribed
func (e *EventRecorderAdapter) sendNotifications(object runtime.Object, opts EventOptions) error {
logCtx := logutil.WithObject(object)
_, namespace, name := logutil.KindNamespaceName(logCtx)
startTime := timeutil.Now()
defer func() {
duration := time.Since(startTime)
e.NotificationSendPerformance.WithLabelValues(namespace, name).Observe(duration.Seconds())
logCtx.WithField("time_ms", duration.Seconds()*1e3).Debug("Notification sent")
}()
notificationsAPI, err := e.apiFactory.GetAPI()
if err != nil {
// don't return error if notifications are not configured and rollout has no subscribers
Expand Down
79 changes: 78 additions & 1 deletion utils/record/record_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/argoproj/notifications-engine/pkg/triggers"
"github.com/golang/mock/gomock"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
dto "github.com/prometheus/client_model/go"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -100,11 +101,87 @@ func TestSendNotifications(t *testing.T) {
apiFactory := &mocks.FakeFactory{Api: mockAPI}
rec := NewFakeEventRecorder()
rec.EventRecorderAdapter.apiFactory = apiFactory

//ch := make(chan prometheus.HistogramVec, 1)
err := rec.sendNotifications(&r, EventOptions{EventReason: "FooReason"})
assert.NoError(t, err)
}

// TestNotificationFailedCounter increments the fake recorder's failure
// counter once for a warning event and checks that the counter reads 1.
func TestNotificationFailedCounter(t *testing.T) {
	rollout := v1alpha1.Rollout{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "guestbook",
			Namespace:   "default",
			Annotations: map[string]string{"notifications.argoproj.io/subscribe.on-foo-reason.console": "console"},
		},
	}
	fakeRecorder := NewFakeEventRecorder()
	eventOpts := EventOptions{EventType: corev1.EventTypeWarning, EventReason: "FooReason"}

	failures := fakeRecorder.NotificationFailedCounter
	failures.WithLabelValues(rollout.Name, rollout.Namespace, eventOpts.EventType, eventOpts.EventReason).Inc()

	assert.Equal(t, float64(1), testutil.ToFloat64(failures))
}

// TestNotificationSuccessCounter increments the fake recorder's success
// counter once for a normal event and checks that the counter reads 1.
func TestNotificationSuccessCounter(t *testing.T) {
	rollout := v1alpha1.Rollout{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "guestbook",
			Namespace:   "default",
			Annotations: map[string]string{"notifications.argoproj.io/subscribe.on-foo-reason.console": "console"},
		},
	}
	fakeRecorder := NewFakeEventRecorder()
	eventOpts := EventOptions{EventType: corev1.EventTypeNormal, EventReason: "FooReason"}

	successes := fakeRecorder.NotificationSuccessCounter
	successes.WithLabelValues(rollout.Name, rollout.Namespace, eventOpts.EventType, eventOpts.EventReason).Inc()

	assert.Equal(t, float64(1), testutil.ToFloat64(successes))
}

// TestNotificationSendPerformance records a fixed series of durations on the
// fake recorder's send histogram and compares the collected output, in the
// Prometheus text format, against the expected bucket counts, sum and count.
func TestNotificationSendPerformance(t *testing.T) {
	rollout := v1alpha1.Rollout{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "guestbook",
			Namespace:   "default",
			Annotations: map[string]string{"notifications.argoproj.io/subscribe.on-foo-reason.console": "console"},
		},
	}
	fakeRecorder := NewFakeEventRecorder()

	// Eleven observations summing to 7.27; three of them exceed the largest
	// finite bucket (1) and only show up in the +Inf bucket.
	samples := []float64{0.4, 1.3, 0.5, 1.4, 0.6, 0.1, 1.3, 0.25, 0.9, 0.17, 0.35}
	for _, seconds := range samples {
		fakeRecorder.NotificationSendPerformance.WithLabelValues(rollout.Namespace, rollout.Name).Observe(seconds)
	}

	registry := prometheus.NewRegistry()
	registry.MustRegister(fakeRecorder.NotificationSendPerformance)

	families, err := registry.Gather()
	if err != nil {
		t.Fatalf("error: %v", err)
	}
	family := families[0]
	gathered := family.Metric[0].Histogram
	log.Infof("mfs: %v, %v, %v, %v", *family, *gathered.SampleCount, *gathered.SampleSum, *gathered.Bucket[0].CumulativeCount)

	want := `# HELP notification_send_performance Notification send performance.
# TYPE notification_send_performance histogram
notification_send_performance_bucket{name="guestbook",namespace="default",le="0.01"} 0
notification_send_performance_bucket{name="guestbook",namespace="default",le="0.15"} 1
notification_send_performance_bucket{name="guestbook",namespace="default",le="0.25"} 3
notification_send_performance_bucket{name="guestbook",namespace="default",le="0.5"} 6
notification_send_performance_bucket{name="guestbook",namespace="default",le="1"} 8
notification_send_performance_bucket{name="guestbook",namespace="default",le="+Inf"} 11
notification_send_performance_sum{name="guestbook",namespace="default"} 7.27
notification_send_performance_count{name="guestbook",namespace="default"} 11
`
	assert.Nil(t, testutil.CollectAndCompare(fakeRecorder.NotificationSendPerformance, strings.NewReader(want)))
}

func TestSendNotificationsFails(t *testing.T) {
r := v1alpha1.Rollout{
ObjectMeta: metav1.ObjectMeta{
Expand Down

0 comments on commit 55a041a

Please sign in to comment.