Alert duration variable (kube-burner#171)
* Alert duration variable

Signed-off-by: Raul Sevilla <[email protected]>

* Update test

Signed-off-by: Raul Sevilla <[email protected]>

* Update docs

Signed-off-by: Raul Sevilla <[email protected]>
rsevilla87 authored Mar 24, 2022
1 parent fb891de commit 7adf2b2
Showing 6 changed files with 39 additions and 12 deletions.
4 changes: 2 additions & 2 deletions cmd/kube-burner/kube-burner.go
@@ -370,8 +370,8 @@ func steps(uuid string, p *prometheus.Prometheus, alertM *alerting.AlertManager)
}
}
if p != nil {
log.Infof("Waiting %v extra before scraping prometheus", p.Step*2)
time.Sleep(p.Step * 2)
log.Infof("Waiting %v extra before scraping prometheus", p.Step)
time.Sleep(p.Step)
// Update end time of last job
jobList[len(jobList)-1].End = time.Now().UTC()
// If alertManager is configured
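This change halves the post-job wait from two scrape intervals to one before the final Prometheus scrape. A minimal sketch of the pattern, assuming an illustrative `step` value (not the actual kube-burner configuration API):

```go
package main

import (
	"log"
	"time"
)

func main() {
	// Illustrative scrape interval; kube-burner takes this from its Prometheus config.
	step := 30 * time.Second

	// Wait one scrape interval so Prometheus ingests the last samples
	// produced by the job before metrics are collected.
	log.Printf("Waiting %v extra before scraping prometheus", step)
	time.Sleep(step)

	// Record the effective end time of the last job after the wait.
	jobEnd := time.Now().UTC()
	log.Printf("Job end time set to %v", jobEnd)
}
```
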
12 changes: 12 additions & 0 deletions docs/alerting.md
@@ -28,6 +28,18 @@ Alarm can be configured with a severity. Each one with different effects. At the
- error: Prints an error message with the alarm description to stdout and sets kube-burner's rc to 1
- critical: Prints a fatal message with the alarm description to stdout and exits execution immediately with rc != 0


### Using the elapsed variable

There's a special go-template variable that can be used within the Prometheus expression: the variable _elapsed_ is set to the job duration (or the range given to `check-alerts`). This variable is especially useful in expressions using [aggregation over time functions](https://prometheus.io/docs/prometheus/latest/querying/functions/#aggregation_over_time).
For example:

```yaml
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{ .elapsed }}:]) > 0.01
description: avg. etcd fsync latency on {{$labels.pod}} higher than 10ms {{$value}}
severity: error
```

## Checking alerts

It's possible to look for alerts without triggering a kube-burner workload. To do so, you can use the `check-alerts` option from the CLI; similar to the `index` CLI option, it accepts the `--start` and `--end` flags to evaluate the alerts over a given time range.
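The docs above describe the `elapsed` variable from the user's side; the value itself is computed from the evaluated time range and substituted with Go's `text/template`, mirroring the `Evaluate` change in `pkg/alerting/alert_manager.go` below. A minimal, self-contained sketch with an illustrative expression and range:

```go
package main

import (
	"bytes"
	"fmt"
	"text/template"
	"time"
)

func main() {
	// Illustrative expression containing the elapsed placeholder; real
	// expressions live in the alert profile.
	expr := "avg_over_time(prometheus_tsdb_wal_page_flushes_total[{{ .elapsed }}:])"

	// elapsed is the job duration (or the check-alerts --start/--end range) in minutes.
	start := time.Now().Add(-10 * time.Minute)
	end := time.Now()
	elapsed := int(end.Sub(start).Minutes())

	var rendered bytes.Buffer
	t := template.Must(template.New("").Parse(expr))
	if err := t.Execute(&rendered, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)}); err != nil {
		panic(err)
	}
	fmt.Println(rendered.String()) // avg_over_time(prometheus_tsdb_wal_page_flushes_total[10m:])
}
```
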
20 changes: 14 additions & 6 deletions pkg/alerting/alert_manager.go
@@ -17,6 +17,7 @@ package alerting
import (
"bytes"
"fmt"
"math"
"strings"
"text/template"
"time"
@@ -94,12 +95,18 @@ func (a *AlertManager) readProfile(alertProfile string) error {

// Evaluate evaluates expressions
func (a *AlertManager) Evaluate(start, end time.Time) int {
elapsed := int(end.Sub(start).Minutes())
var renderedQuery bytes.Buffer
result := Passed
for _, alert := range a.alertProfile {
log.Infof("Evaluating expression: '%s'", alert.Expr)
v, err := a.prometheus.QueryRange(alert.Expr, start, end)
t, _ := template.New("").Parse(alert.Expr)
t.Execute(&renderedQuery, map[string]string{"elapsed": fmt.Sprintf("%dm", elapsed)})
expr := renderedQuery.String()
renderedQuery.Reset()
log.Infof("Evaluating expression: '%s'", expr)
v, err := a.prometheus.QueryRange(expr, start, end)
if err != nil {
log.Warnf("Error performing query %s: %s", alert.Expr, err)
log.Warnf("Error performing query %s: %s", expr, err)
continue
}
alarmResult, err := parseMatrix(v, alert.Description, alert.Severity)
@@ -133,18 +140,19 @@ func parseMatrix(value model.Value, description string, severity severityLevel)
}

for _, v := range data {
// TODO: improve value casting
templateData.Labels = make(map[string]string)
for k, v := range v.Metric {
templateData.Labels[string(k)] = string(v)
}
for _, val := range v.Values {
renderedDesc.Reset()
templateData.Value = float64(val.Value)
// Take 3 decimals
templateData.Value = math.Round(float64(val.Value)*1000) / 1000
if err := t.Execute(&renderedDesc, templateData); err != nil {
log.Errorf("Rendering error: %s", err)
result = Failed
}
msg := fmt.Sprintf("Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
msg := fmt.Sprintf("🚨 Alert triggered at %v: '%s'", val.Timestamp.Time(), renderedDesc.String())
switch severity {
case sevWarn:
log.Warn(msg)
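The `parseMatrix` changes above render the alert description as a go-template fed with the metric labels and the sample value, now rounded to three decimals and prefixed with the 🚨 marker. A minimal sketch of that rendering, using illustrative type, label, and template names rather than the exact kube-burner types:

```go
package main

import (
	"bytes"
	"fmt"
	"math"
	"text/template"
)

// descriptionTemplate mirrors the data handed to the description template:
// the metric labels and the sample value (illustrative struct, not kube-burner's).
type descriptionTemplate struct {
	Labels map[string]string
	Value  float64
}

func main() {
	// Plain field references here; kube-burner's profiles use $labels/$value syntax.
	desc := "avg. etcd fsync latency on {{.Labels.pod}} higher than 10ms {{.Value}}"
	data := descriptionTemplate{
		Labels: map[string]string{"pod": "etcd-0"},
		// Keep only three decimals of the sample value.
		Value: math.Round(0.0123456*1000) / 1000,
	}

	t := template.Must(template.New("").Parse(desc))
	var rendered bytes.Buffer
	if err := t.Execute(&rendered, data); err != nil {
		panic(err)
	}
	fmt.Printf("🚨 Alert triggered: '%s'\n", rendered.String())
}
```
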
5 changes: 3 additions & 2 deletions pkg/burner/create.go
@@ -113,7 +113,7 @@ func (ex *Executor) RunCreateJob() {
log.Fatal(err.Error())
}
}
t0 := time.Now()
t0 := time.Now().Round(time.Second)
for i := 1; i <= ex.Config.JobIterations; i++ {
log.Debugf("Creating object replicas from iteration %d", i)
if ex.nsObjects && ex.Config.NamespacedIterations {
@@ -166,7 +166,8 @@ func (ex *Executor) RunCreateJob() {
}
wg.Wait()
}
log.Infof("Finished the create job in %g seconds", time.Since(t0).Seconds())
t1 := time.Now().Round(time.Second)
log.Infof("Finished the create job in %g seconds", t1.Sub(t0).Seconds())
}

func (ex *Executor) replicaHandler(objectIndex int, obj object, ns string, iteration int, wg *sync.WaitGroup) {
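The create-job timing now rounds both endpoints to whole seconds before computing the duration, so the logged figure lands on second boundaries. A minimal sketch of the effect, with an illustrative sleep standing in for the job:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Rounding both endpoints to the second yields a whole-second duration.
	t0 := time.Now().Round(time.Second)
	time.Sleep(1500 * time.Millisecond) // stand-in for the create job
	t1 := time.Now().Round(time.Second)
	fmt.Printf("Finished the create job in %g seconds\n", t1.Sub(t0).Seconds())
}
```
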
6 changes: 4 additions & 2 deletions pkg/measurements/metrics/metrics.go
@@ -90,11 +90,13 @@ func CheckThreshold(thresholds []types.LatencyThreshold, quantiles []interface{}
// Required to access the attribute by name
r := reflect.ValueOf(pq.(LatencyQuantiles))
v := r.FieldByName(phase.Metric).Int()
latency := float32(v) / 1000
msg := fmt.Sprintf("❗ %s %s latency (%.2fs) higher than configured threshold: %v", phase.Metric, phase.ConditionType, latency, phase.Threshold)
if v > phase.Threshold.Milliseconds() {
log.Warnf("%s %s latency (%dms) higher than configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
log.Error(msg)
rc = 1
} else {
log.Infof("%s %s latency (%dms) meets the configured threshold: %v", phase.Metric, phase.ConditionType, v, phase.Threshold)
log.Info(msg)
}
continue
}
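The threshold message now reports the latency in seconds with two decimals plus the ❗ marker. A minimal sketch of the comparison, assuming a measured latency in milliseconds and a `time.Duration` threshold (metric and condition names are illustrative):

```go
package main

import (
	"fmt"
	"log"
	"time"
)

func main() {
	rc := 0
	latencyMs := int64(5320)     // e.g. measured Ready P99 latency in milliseconds
	threshold := 5 * time.Second // configured latency threshold

	latencySec := float32(latencyMs) / 1000 // seconds, for the log message
	if latencyMs > threshold.Milliseconds() {
		log.Printf("❗ Ready P99 latency (%.2fs) higher than configured threshold: %v", latencySec, threshold)
		rc = 1
	} else {
		log.Printf("Ready P99 latency (%.2fs) meets the configured threshold: %v", latencySec, threshold)
	}
	fmt.Println("return code:", rc)
}
```
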
4 changes: 4 additions & 0 deletions test/alert-profile.yaml
@@ -1,3 +1,7 @@
- expr: count(prometheus_build_info) > 1
severity: critical
description: No prometheus instance found

- expr: avg_over_time(prometheus_tsdb_wal_page_flushes_total[{{ .elapsed }}:])
severity: warning
description: Test
