From 496346c2220d0ccf91ef0f5dc89a56842a011cdb Mon Sep 17 00:00:00 2001 From: Mark Wolfe Date: Mon, 22 Jul 2024 10:45:34 +1000 Subject: [PATCH] fix to remove reset in prometheus backend This has been reoved as it can't work for more than one cluster as gauges are scoped to all clusters, but the collect function is called once for each cluster. In summary this leads to gauges only being correct for the last cluster, depending what metrics were returned when the cluster was polled as the result doesn't always include all keys. --- backend/prometheus.go | 31 ++++++++++++++++--------------- main.go | 2 ++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/backend/prometheus.go b/backend/prometheus.go index 8869e55..7d10eb1 100644 --- a/backend/prometheus.go +++ b/backend/prometheus.go @@ -17,17 +17,20 @@ var ( camel = regexp.MustCompile("(^[^A-Z0-9]*|[A-Z0-9]*)([A-Z0-9][^A-Z]+|$)") ) +// Prometheus this holds a list of prometheus gauges which have been created, one for each metric +// that we want to expose. These are created on the fly as we receive metrics from the agent. +// +// Note: these metrics are not unique to a cluster / queue, as these labels are added to the +// value when it is set. type Prometheus struct { - totals map[string]*prometheus.GaugeVec - queues map[string]*prometheus.GaugeVec - pipelines map[string]*prometheus.GaugeVec + totals map[string]*prometheus.GaugeVec + queues map[string]*prometheus.GaugeVec } func NewPrometheusBackend() *Prometheus { return &Prometheus{ - totals: make(map[string]*prometheus.GaugeVec), - queues: make(map[string]*prometheus.GaugeVec), - pipelines: make(map[string]*prometheus.GaugeVec), + totals: make(map[string]*prometheus.GaugeVec), + queues: make(map[string]*prometheus.GaugeVec), } } @@ -38,16 +41,14 @@ func (p *Prometheus) Serve(path, addr string) { log.Fatal(http.ListenAndServe(addr, m)) } +// Collect receives a set of metrics from the agent and creates or updates the prometheus gauges +// +// Note: This is called once per agent token per interval func (p *Prometheus) Collect(r *collector.Result) error { - // Clear the gauges to prevent stale values from persisting forever. - for _, gauge := range p.queues { - gauge.Reset() - } - for name, value := range r.Totals { labelNames := []string{"cluster"} gauge, ok := p.totals[name] - if !ok { + if !ok { // first time this metric has been seen so create a new gauge gauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: fmt.Sprintf("buildkite_total_%s", camelToUnderscore(name)), Help: fmt.Sprintf("Buildkite Total: %s", name), @@ -56,14 +57,14 @@ func (p *Prometheus) Collect(r *collector.Result) error { p.totals[name] = gauge } - // note that r.Cluster will be empty for unclustered agents, this label will be dropped by promethues + // note that r.Cluster will be empty for unclustered agents, this label will be dropped by prometheus gauge.WithLabelValues(r.Cluster).Set(float64(value)) } for queue, counts := range r.Queues { for name, value := range counts { gauge, ok := p.queues[name] - if !ok { + if !ok { // first time this metric has been seen so create a new gauge labelNames := []string{"queue", "cluster"} gauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: fmt.Sprintf("buildkite_queues_%s", camelToUnderscore(name)), @@ -73,7 +74,7 @@ func (p *Prometheus) Collect(r *collector.Result) error { p.queues[name] = gauge } - // note that r.Cluster will be empty for unclustered agents, this label will be dropped by promethues + // note that r.Cluster will be empty for unclustered agents, this label will be dropped by prometheus gauge.WithLabelValues(queue, r.Cluster).Set(float64(value)) } } diff --git a/main.go b/main.go index 7525273..29cfd24 100644 --- a/main.go +++ b/main.go @@ -19,6 +19,8 @@ import ( var metricsBackend backend.Backend func main() { + log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) + var ( interval = flag.Duration("interval", 0, "Update metrics every interval, rather than once") showVersion = flag.Bool("version", false, "Show the version")