Skip to content

Commit

Permalink
add prometheus metrics to record warning total and last status of bac…
Browse files Browse the repository at this point in the history
…kups

Signed-off-by: allenxu404 <[email protected]>
  • Loading branch information
allenxu404 committed Jan 30, 2023
1 parent 598333d commit a5a165b
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 0 deletions.
3 changes: 3 additions & 0 deletions changelogs/unreleased/5779-allenxu404
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add File system backup related metrics to Grafana dashboard
Add metric backup_warning_total to record the total number of backups that finished with warnings
Add metric backup_last_status to record the last status of the backup (1 is success, 0 is failure)
8 changes: 8 additions & 0 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,12 +303,16 @@ func (c *backupController) processBackup(key string) error {
switch request.Status.Phase {
case velerov1api.BackupPhaseCompleted:
c.metrics.RegisterBackupSuccess(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc)
case velerov1api.BackupPhasePartiallyFailed:
c.metrics.RegisterBackupPartialFailure(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
case velerov1api.BackupPhaseFailed:
c.metrics.RegisterBackupFailed(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
case velerov1api.BackupPhaseFailedValidation:
c.metrics.RegisterBackupValidationFailure(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}

log.Debug("Updating backup's final status")
Expand Down Expand Up @@ -789,6 +793,10 @@ func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, bac
serverMetrics.RegisterBackupItemsTotalGauge(backupScheduleName, backup.Status.Progress.TotalItems)
}
serverMetrics.RegisterBackupItemsErrorsGauge(backupScheduleName, backup.Status.Errors)

if backup.Status.Warnings > 0 {
serverMetrics.RegisterBackupWarning(backupScheduleName)
}
}

func persistBackup(backup *pkgbackup.Request,
Expand Down
42 changes: 42 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ const (
backupLastSuccessfulTimestamp = "backup_last_successful_timestamp"
backupItemsTotalGauge = "backup_items_total"
backupItemsErrorsGauge = "backup_items_errors"
backupWarningTotal = "backup_warning_total"
backupLastStatus = "backup_last_status"
restoreTotal = "restore_total"
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
Expand All @@ -70,6 +72,10 @@ const (
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"

// metrics values
BackupLastStatusSucc int64 = 1
BackupLastStatusFailure int64 = 0
)

// NewServerMetrics returns new ServerMetrics
Expand Down Expand Up @@ -198,6 +204,22 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
backupWarningTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupWarningTotal,
Help: "Total number of warned backups",
},
[]string{scheduleLabel},
),
backupLastStatus: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricNamespace,
Name: backupLastStatus,
Help: "Last status of the backup. A value of 1 is success, 0 is failure",
},
[]string{scheduleLabel},
),
restoreTotal: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: metricNamespace,
Expand Down Expand Up @@ -386,6 +408,12 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
Expand Down Expand Up @@ -559,6 +587,20 @@ func (m *ServerMetrics) RegisterBackupItemsErrorsGauge(backupSchedule string, it
}
}

// RegisterBackupWarning increments the backupWarningTotal counter for the
// series labelled with backupSchedule. If the entry stored under
// backupWarningTotal is not a *prometheus.CounterVec, the call is a no-op.
func (m *ServerMetrics) RegisterBackupWarning(backupSchedule string) {
	warned, isCounterVec := m.metrics[backupWarningTotal].(*prometheus.CounterVec)
	if !isCounterVec {
		// Metric missing or registered with a different collector type.
		return
	}
	warned.WithLabelValues(backupSchedule).Inc()
}

// RegisterBackupLastStatus sets the backupLastStatus gauge for the series
// labelled with backupSchedule to lastStatus, converted to float64. If the
// entry stored under backupLastStatus is not a *prometheus.GaugeVec, the call
// is a no-op.
func (m *ServerMetrics) RegisterBackupLastStatus(backupSchedule string, lastStatus int64) {
	statusGauge, isGaugeVec := m.metrics[backupLastStatus].(*prometheus.GaugeVec)
	if !isGaugeVec {
		// Metric missing or registered with a different collector type.
		return
	}
	value := float64(lastStatus)
	statusGauge.WithLabelValues(backupSchedule).Set(value)
}

// toSeconds translates a time.Duration value into a float64
// representing the number of seconds in that duration.
func toSeconds(d time.Duration) float64 {
Expand Down

0 comments on commit a5a165b

Please sign in to comment.