diff --git a/cmd/kube-burner/kube-burner.go b/cmd/kube-burner/kube-burner.go
index c581699c7..a57abb667 100644
--- a/cmd/kube-burner/kube-burner.go
+++ b/cmd/kube-burner/kube-burner.go
@@ -370,8 +370,8 @@ func steps(uuid string, p *prometheus.Prometheus, alertM *alerting.AlertManager)
 		}
 	}
 	if p != nil {
-		log.Infof("Waiting %v extra before scraping prometheus", p.Step*2)
-		time.Sleep(p.Step * 2)
+		log.Infof("Waiting %v extra before scraping prometheus", p.Step)
+		time.Sleep(p.Step)
 		// Update end time of last job
 		jobList[len(jobList)-1].End = time.Now().UTC()
 		// If alertManager is configured
diff --git a/docs/alerting.md b/docs/alerting.md
index 4d750a47c..308ab32be 100644
--- a/docs/alerting.md
+++ b/docs/alerting.md
@@ -28,6 +28,18 @@ Alarm can be configured with a severity. Each one with different effects. At the
 - error: Prints an error message with the alarm description to stdout and makes kube-burner rc = 1
 - critical: Prints a fatal message with the alarm description to stdout and exits execution immediately with rc != 0
+
+### Using the elapsed variable
+
+There's a special go-template variable that can be used within the Prometheus expressions: _elapsed_ is set to the duration of the job (or to the range given to check-alerts), rendered in minutes (e.g. `10m`). This variable is especially useful in expressions using [aggregation over time functions](https://prometheus.io/docs/prometheus/latest/querying/functions/#aggregation_over_time).
+For example:
+
+```yaml
+- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{ .elapsed }}:]) > 0.01
+  description: avg. etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}
+  severity: error
+```
+
 ## Checking alerts
 
 It's possible to look for alerts without triggering a kube-burner workload. To do so you can use the `check-alerts` CLI option; similar to the `index` CLI option, it accepts the flags `--start` and `--end` to evaluate the alerts over a given time range.
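For reference, here is a minimal standalone sketch of how the `{{ .elapsed }}` substitution behaves, mirroring the rendering step added to `Evaluate` in `pkg/alerting/alert_manager.go` below. The `renderExpr` helper and the sample expression are illustrative, not part of the patch:

```go
package main

import (
	"bytes"
	"fmt"
	"text/template"
	"time"
)

// renderExpr mirrors the substitution step added to Evaluate: it renders
// {{ .elapsed }} as the evaluated range in minutes (e.g. "10m").
func renderExpr(expr string, start, end time.Time) (string, error) {
	elapsed := int(end.Sub(start).Minutes())
	t, err := template.New("expr").Parse(expr)
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)}); err != nil {
		return "", err
	}
	return buf.String(), nil
}

func main() {
	end := time.Now()
	start := end.Add(-10 * time.Minute)
	expr, _ := renderExpr("avg_over_time(up[{{ .elapsed }}:])", start, end)
	fmt.Println(expr) // avg_over_time(up[10m:])
}
```

Note that because elapsed already carries the `m` suffix, profiles should use `[{{ .elapsed }}:]` (a subquery range), not `[{{ .elapsed }}m]`.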
diff --git a/pkg/alerting/alert_manager.go b/pkg/alerting/alert_manager.go
index c44791fdc..f1f539ab0 100644
--- a/pkg/alerting/alert_manager.go
+++ b/pkg/alerting/alert_manager.go
@@ -17,6 +17,7 @@ package alerting
 import (
 	"bytes"
 	"fmt"
+	"math"
 	"strings"
 	"text/template"
 	"time"
@@ -94,12 +95,18 @@ func (a *AlertManager) readProfile(alertProfile string) error {
 
 // Evaluate evaluates expressions
 func (a *AlertManager) Evaluate(start, end time.Time) int {
+	elapsed := int(end.Sub(start).Minutes())
+	var renderedQuery bytes.Buffer
 	result := Passed
 	for _, alert := range a.alertProfile {
-		log.Infof("Evaluating expression: '%s'", alert.Expr)
-		v, err := a.prometheus.QueryRange(alert.Expr, start, end)
+		t, _ := template.New("").Parse(alert.Expr)
+		t.Execute(&renderedQuery, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)})
+		expr := renderedQuery.String()
+		renderedQuery.Reset()
+		log.Infof("Evaluating expression: '%s'", expr)
+		v, err := a.prometheus.QueryRange(expr, start, end)
 		if err != nil {
-			log.Warnf("Error performing query %s: %s", alert.Expr, err)
+			log.Warnf("Error performing query %s: %s", expr, err)
 			continue
 		}
 		alarmResult, err := parseMatrix(v, alert.Description, alert.Severity)
@@ -133,18 +140,19 @@ func parseMatrix(value model.Value, description string, severity severityLevel)
 	}
 	for _, v := range data {
-		// TODO: improve value casting
 		templateData.Labels = make(map[string]string)
 		for k, v := range v.Metric {
 			templateData.Labels[string(k)] = string(v)
 		}
 		for _, val := range v.Values {
 			renderedDesc.Reset()
-			templateData.Value = float64(val.Value)
+			// Round the value to three decimal places
+			templateData.Value = math.Round(float64(val.Value)*1000) / 1000
 			if err := t.Execute(&renderedDesc, templateData); err != nil {
 				log.Errorf("Rendering error: %s", err)
+				result = Failed
 			}
-			msg := fmt.Sprintf("Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
+			msg := fmt.Sprintf("🚨 Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
 			switch severity {
 			case sevWarn:
 				log.Warn(msg)
diff --git a/pkg/burner/create.go b/pkg/burner/create.go
index a6b3399e9..cad8173d9 100644
--- a/pkg/burner/create.go
+++ b/pkg/burner/create.go
@@ -113,7 +113,7 @@ func (ex *Executor) RunCreateJob() {
 			log.Fatal(err.Error())
 		}
 	}
-	t0 := time.Now()
+	t0 := time.Now().Round(time.Second)
 	for i := 1; i <= ex.Config.JobIterations; i++ {
 		log.Debugf("Creating object replicas from iteration %d", i)
 		if ex.nsObjects && ex.Config.NamespacedIterations {
@@ -166,7 +166,8 @@ func (ex *Executor) RunCreateJob() {
 		}
 		wg.Wait()
 	}
-	log.Infof("Finished the create job in %g seconds", time.Since(t0).Seconds())
+	t1 := time.Now().Round(time.Second)
+	log.Infof("Finished the create job in %g seconds", t1.Sub(t0).Seconds())
 }
 
 func (ex *Executor) replicaHandler(objectIndex int, obj object, ns string, iteration int, wg *sync.WaitGroup) {
diff --git a/pkg/measurements/metrics/metrics.go b/pkg/measurements/metrics/metrics.go
index 28e08d362..2dc5047ac 100644
--- a/pkg/measurements/metrics/metrics.go
+++ b/pkg/measurements/metrics/metrics.go
@@ -90,11 +90,12 @@ func CheckThreshold(thresholds []types.LatencyThreshold, quantiles []interface{}
 			// Required to access the attribute by name
 			r := reflect.ValueOf(pq.(LatencyQuantiles))
 			v := r.FieldByName(phase.Metric).Int()
+			latency := float32(v) / 1000
 			if v > phase.Threshold.Milliseconds() {
-				log.Warnf("%s %s latency (%dms) higher than configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
+				log.Errorf("❗ %s %s latency (%.2fs) higher than configured threshold: %v", phase.Metric, phase.ConditionType, latency, phase.Threshold)
 				rc = 1
 			} else {
-				log.Infof("%s %s latency (%dms) meets the configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
+				log.Infof("%s %s latency (%.2fs) meets the configured threshold: %v", phase.Metric, phase.ConditionType, latency, phase.Threshold)
 			}
 			continue
 		}
diff --git a/test/alert-profile.yaml b/test/alert-profile.yaml
index 88107abfe..0c8bee4cc 100644
--- a/test/alert-profile.yaml
+++ b/test/alert-profile.yaml
@@ -1,3 +1,7 @@
 - expr: count(prometheus_build_info) > 1
   severity: critical
   description: No prometheus instance found
+
+- expr: avg_over_time(prometheus_tsdb_wal_page_flushes_total[{{ .elapsed }}:])
+  severity: warning
+  description: Test
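To illustrate the reworked threshold check in `pkg/measurements/metrics/metrics.go`, here is a self-contained sketch with the types pared down; `checkThreshold` and its arguments are illustrative, not the actual API:

```go
package main

import (
	"fmt"
	"time"
)

// checkThreshold is an illustrative reduction of CheckThreshold: latencies
// are tracked internally in milliseconds but reported in seconds with two
// decimals, matching the new log format.
func checkThreshold(metric string, latencyMs int64, threshold time.Duration) int {
	latency := float32(latencyMs) / 1000
	if latencyMs > threshold.Milliseconds() {
		fmt.Printf("❗ %s latency (%.2fs) higher than configured threshold: %v\n", metric, latency, threshold)
		return 1
	}
	fmt.Printf("%s latency (%.2fs) meets the configured threshold: %v\n", metric, latency, threshold)
	return 0
}

func main() {
	rc := checkThreshold("P99", 12500, 10*time.Second) // exceeds threshold: rc = 1
	_ = checkThreshold("P50", 800, 10*time.Second)     // within threshold: rc = 0
	fmt.Println("exit code:", rc)
}
```

Logging at error level (rather than warn) when a threshold is exceeded matches the nonzero return code the check already produces.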