Alert duration variable (kube-burner#171)
* Alert duration variable

Signed-off-by: Raul Sevilla <[email protected]>

* Update test

Signed-off-by: Raul Sevilla <[email protected]>

* Update docs

Signed-off-by: Raul Sevilla <[email protected]>
rsevilla87 authored Mar 24, 2022
1 parent fb891de commit 7adf2b2
Showing 6 changed files with 39 additions and 12 deletions.
4 changes: 2 additions & 2 deletions cmd/kube-burner/kube-burner.go
@@ -370,8 +370,8 @@ func steps(uuid string, p *prometheus.Prometheus, alertM *alerting.AlertManager)
}
}
if p != nil {
log.Infof("Waiting %v extra before scraping prometheus", p.Step*2)
time.Sleep(p.Step * 2)
log.Infof("Waiting %v extra before scraping prometheus", p.Step)
time.Sleep(p.Step)
// Update end time of last job
jobList[len(jobList)-1].End = time.Now().UTC()
// If alertManager is configured
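This change halves the post-job wait from two scrape intervals to one before the final Prometheus scrape. A minimal sketch of the pattern, assuming an illustrative `step` value (not the actual kube-burner configuration API):

```go
package main

import (
	"log"
	"time"
)

func main() {
	// Illustrative scrape interval; kube-burner takes this from its Prometheus config.
	step := 30 * time.Second

	// Wait one scrape interval so Prometheus ingests the last samples
	// produced by the job before metrics are collected.
	log.Printf("Waiting %v extra before scraping prometheus", step)
	time.Sleep(step)

	// Record the effective end time of the last job after the wait.
	jobEnd := time.Now().UTC()
	log.Printf("Job end time set to %v", jobEnd)
}
```
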
12 changes: 12 additions & 0 deletions docs/alerting.md
@@ -28,6 +28,18 @@ Alarm can be configured with a severity. Each one with different effects. At the
- error: Prints an error message with the alarm description to stdout and sets kube-burner's rc to 1
- critical: Prints a fatal message with the alarm description to stdout and exits execution immediately with rc != 0


### Using the elapsed variable

There's a special go-template variable that can be used within the Prometheus expression: the variable _elapsed_ is set to the job duration (or the range given to `check-alerts`). This variable is especially useful in expressions using [aggregation over time functions](https://prometheus.io/docs/prometheus/latest/querying/functions/#aggregation_over_time).
For example:

```yaml
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{ .elapsed }}:]) > 0.01
description: avg. etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}
severity: error
```

## Checking alerts

It's possible to look for alerts without triggering a kube-burner workload. To do so, you can use the `check-alerts` option from the CLI; similar to the `index` CLI option, it accepts the `--start` and `--end` flags to evaluate the alerts over a given time range.
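The docs above describe the `elapsed` variable from the user's side; the value itself is computed from the evaluated time range and substituted with Go's `text/template`, mirroring the `Evaluate` change in `pkg/alerting/alert_manager.go` below. A minimal, self-contained sketch with an illustrative expression and range:

```go
package main

import (
	"bytes"
	"fmt"
	"text/template"
	"time"
)

func main() {
	// Illustrative expression containing the elapsed placeholder; real
	// expressions live in the alert profile.
	expr := "avg_over_time(prometheus_tsdb_wal_page_flushes_total[{{ .elapsed }}:])"

	// elapsed is the job duration (or the check-alerts --start/--end range) in minutes.
	start := time.Now().Add(-10 * time.Minute)
	end := time.Now()
	elapsed := int(end.Sub(start).Minutes())

	var rendered bytes.Buffer
	t := template.Must(template.New("").Parse(expr))
	if err := t.Execute(&rendered, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)}); err != nil {
		panic(err)
	}
	fmt.Println(rendered.String()) // avg_over_time(prometheus_tsdb_wal_page_flushes_total[10m:])
}
```
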
20 changes: 14 additions & 6 deletions pkg/alerting/alert_manager.go
@@ -17,6 +17,7 @@ package alerting
import (
"bytes"
"fmt"
"math"
"strings"
"text/template"
"time"
@@ -94,12 +95,18 @@ func (a *AlertManager) readProfile(alertProfile string) error {

// Evaluate evaluates expressions
func (a *AlertManager) Evaluate(start, end time.Time) int {
elapsed := int(end.Sub(start).Minutes())
var renderedQuery bytes.Buffer
result := Passed
for _, alert := range a.alertProfile {
log.Infof("Evaluating expression: '%s'", alert.Expr)
v, err := a.prometheus.QueryRange(alert.Expr, start, end)
t, _ := template.New("").Parse(alert.Expr)
t.Execute(&renderedQuery, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)})
expr := renderedQuery.String()
renderedQuery.Reset()
log.Infof("Evaluating expression: '%s'", expr)
v, err := a.prometheus.QueryRange(expr, start, end)
if err != nil {
log.Warnf("Error performing query %s: %s", alert.Expr, err)
log.Warnf("Error performing query %s: %s", expr, err)
continue
}
alarmResult, err := parseMatrix(v, alert.Description, alert.Severity)
@@ -133,18 +140,19 @@ func parseMatrix(value model.Value, description string, severity severityLevel)
}

for _, v := range data {
// TODO: improve value casting
templateData.Labels = make(map[string]string)
for k, v := range v.Metric {
templateData.Labels[string(k)] = string(v)
}
for _, val := range v.Values {
renderedDesc.Reset()
templateData.Value = float64(val.Value)
// Take 3 decimals
templateData.Value = math.Round(float64(val.Value)*1000) / 1000
if err := t.Execute(&renderedDesc, templateData); err != nil {
log.Errorf("Rendering error: %s", err)
result = Failed
}
msg := fmt.Sprintf("Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
msg := fmt.Sprintf("🚨 Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
switch severity {
case sevWarn:
log.Warn(msg)
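The `parseMatrix` changes above render the alert description as a go-template fed with the metric labels and the sample value, now rounded to three decimals and prefixed with the 🚨 marker. A minimal sketch of that rendering, using illustrative type, label, and template names rather than the exact kube-burner types:

```go
package main

import (
	"bytes"
	"fmt"
	"math"
	"text/template"
)

// descriptionTemplate mirrors the data handed to the description template:
// the metric labels and the sample value (illustrative struct, not kube-burner's).
type descriptionTemplate struct {
	Labels map[string]string
	Value  float64
}

func main() {
	// Plain field references here; kube-burner's profiles use $labels/$value syntax.
	desc := "avg. etcd fsync latency on {{.Labels.pod}} higher than 10ms {{.Value}}"
	data := descriptionTemplate{
		Labels: map[string]string{"pod": "etcd-0"},
		// Keep only three decimals of the sample value.
		Value: math.Round(0.0123456*1000) / 1000,
	}

	t := template.Must(template.New("").Parse(desc))
	var rendered bytes.Buffer
	if err := t.Execute(&rendered, data); err != nil {
		panic(err)
	}
	fmt.Printf("🚨 Alert triggered: '%s'\n", rendered.String())
}
```
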
5 changes: 3 additions & 2 deletions pkg/burner/create.go
@@ -113,7 +113,7 @@ func (ex *Executor) RunCreateJob() {
log.Fatal(err.Error())
}
}
t0 := time.Now()
t0 := time.Now().Round(time.Second)
for i := 1; i <= ex.Config.JobIterations; i++ {
log.Debugf("Creating object replicas from iteration %d", i)
if ex.nsObjects && ex.Config.NamespacedIterations {
@@ -166,7 +166,8 @@ func (ex *Executor) RunCreateJob() {
}
wg.Wait()
}
log.Infof("Finished the create job in %g seconds", time.Since(t0).Seconds())
t1 := time.Now().Round(time.Second)
log.Infof("Finished the create job in %g seconds", t1.Sub(t0).Seconds())
}

func (ex *Executor) replicaHandler(objectIndex int, obj object, ns string, iteration int, wg *sync.WaitGroup) {
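The create-job timing now rounds both endpoints to whole seconds before computing the duration, so the logged figure lands on second boundaries. A minimal sketch of the effect, with an illustrative sleep standing in for the job:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Rounding both endpoints to the second yields a whole-second duration.
	t0 := time.Now().Round(time.Second)
	time.Sleep(1500 * time.Millisecond) // stand-in for the create job
	t1 := time.Now().Round(time.Second)
	fmt.Printf("Finished the create job in %g seconds\n", t1.Sub(t0).Seconds())
}
```
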
6 changes: 4 additions & 2 deletions pkg/measurements/metrics/metrics.go
@@ -90,11 +90,13 @@ func CheckThreshold(thresholds []types.LatencyThreshold, quantiles []interface{}
// Required to access the attribute by name
r := reflect.ValueOf(pq.(LatencyQuantiles))
v := r.FieldByName(phase.Metric).Int()
latency := float32(v) / 1000
msg := fmt.Sprintf("❗ %s %s latency (%.2fs) higher than configured threshold: %v", phase.Metric, phase.ConditionType, latency, phase.Threshold)
if v > phase.Threshold.Milliseconds() {
log.Warnf("%s %s latency (%dms) higher than configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
log.Error(msg)
rc = 1
} else {
log.Infof("%s %s latency (%dms) meets the configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
log.Info(msg)
}
continue
}
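The threshold message now reports the latency in seconds with two decimals plus the ❗ marker. A minimal sketch of the comparison, assuming a measured latency in milliseconds and a `time.Duration` threshold (metric and condition names are illustrative):

```go
package main

import (
	"fmt"
	"log"
	"time"
)

func main() {
	rc := 0
	latencyMs := int64(5320)     // e.g. measured Ready P99 latency in milliseconds
	threshold := 5 * time.Second // configured latency threshold

	latencySec := float32(latencyMs) / 1000 // seconds, for the log message
	if latencyMs > threshold.Milliseconds() {
		log.Printf("❗ Ready P99 latency (%.2fs) higher than configured threshold: %v", latencySec, threshold)
		rc = 1
	} else {
		log.Printf("Ready P99 latency (%.2fs) meets the configured threshold: %v", latencySec, threshold)
	}
	fmt.Println("return code:", rc)
}
```
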
4 changes: 4 additions & 0 deletions test/alert-profile.yaml
@@ -1,3 +1,7 @@
- expr: count(prometheus_build_info) > 1
severity: critical
description: No prometheus instance found

- expr: avg_over_time(prometheus_tsdb_wal_page_flushes_total[{{ .elapsed }}:])
severity: warning
description: Test
