From 49d4ce13bb2f1f1d34e5a9ae8713e21de67daf82 Mon Sep 17 00:00:00 2001 From: Sami Alhaddad Date: Thu, 16 Feb 2023 00:38:09 +0100 Subject: [PATCH] feat: add more metric to detect failures (#18) --- pkg/metric/metric.go | 14 ++++++++++++++ pkg/server/handlers.go | 11 ++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index 3de8c72..623bb82 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -10,6 +10,8 @@ type Metrics struct { ChangeResources *prometheus.GaugeVec AddResources *prometheus.GaugeVec DestroyResources *prometheus.GaugeVec + PlanFailure *prometheus.GaugeVec + GitPullFailure *prometheus.GaugeVec } func NewMetrics(reg prometheus.Registerer) *Metrics { @@ -29,10 +31,22 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { Name: "plan_destroy_resources", Help: "Number of resources to be destroyed based on tf plan", }, []string{"stack"}), + PlanFailure: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "terradrift", + Name: "plan_failure", + Help: "Status of the last scan of a stack", + }, []string{"stack"}), + GitPullFailure: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "terradrift", + Name: "git_pull_failure", + Help: "Status of the last git pull", + }, []string{}), } reg.MustRegister(m.AddResources) reg.MustRegister(m.ChangeResources) reg.MustRegister(m.DestroyResources) + reg.MustRegister(m.PlanFailure) + reg.MustRegister(m.GitPullFailure) return m } diff --git a/pkg/server/handlers.go b/pkg/server/handlers.go index daadc87..2b79bcf 100644 --- a/pkg/server/handlers.go +++ b/pkg/server/handlers.go @@ -17,6 +17,9 @@ func (s Server) scanHandler(c *gin.Context) { name := c.Query("stack") planResp, err := tfstack.StackScan(name, s.Workdir, s.ConfigPath, s.ExtraBackendVars) + // Reset the plan failure metric + promMetrics.PlanFailure.With(prometheus.Labels{"stack": name}).Set(0) + if err == nil { // Record metrics for drifts in resources @@ -35,11 +38,13 @@ func (s Server) scanHandler(c *gin.Context) { c.JSON(404, errorMessage) } else if strings.Contains(errorMessage, "error acquiring the state lock") { + promMetrics.PlanFailure.With(prometheus.Labels{"stack": name}).Set(1) // When there's a current terrafom plan in progress, terraform locks the state till it's finished. c.JSON(502, "Another plan is in-progress for the requested stack, please try again in few minutes.") } else { + promMetrics.PlanFailure.With(prometheus.Labels{"stack": name}).Set(1) c.JSON(500, errorMessage) } } @@ -48,9 +53,13 @@ func (s Server) scanHandler(c *gin.Context) { // gitHandler is a handler function for git sync endpoint func (s Server) gitHandler(c *gin.Context) { + // Reset the git failure metric + promMetrics.GitPullFailure.With(prometheus.Labels{}).Set(0) + status, err := git.GitPull(s.Workdir, s.GitToken, s.GitTimeout) if err != nil { - c.JSON(500, err) + promMetrics.GitPullFailure.With(prometheus.Labels{}).Set(1) + c.JSON(500, error.Error(err)) } else { c.JSON(200, status) }