diff --git a/monitoring/storage.go b/monitoring/storage.go
index 4d6f7b713..9b861e6a9 100644
--- a/monitoring/storage.go
+++ b/monitoring/storage.go
@@ -19,6 +19,8 @@ package monitoring
 
 import (
 	"fmt"
+	"github.com/gravitational/trace"
+
 	humanize "github.com/dustin/go-humanize"
 )
 
@@ -38,10 +40,25 @@ type StorageConfig struct {
 	HighWatermark uint
 }
 
+// CheckAndSetDefaults validates that this configuration is correct and sets
+// value defaults where necessary.
+func (c *StorageConfig) CheckAndSetDefaults() error {
+	var errors []error
+	if c.Path == "" {
+		errors = append(errors, trace.BadParameter("volume path must be provided"))
+	}
+	if c.HighWatermark == 0 {
+		c.HighWatermark = DefaultCriticalWatermark
+	}
+	return trace.NewAggregate(errors...)
+}
+
 // HighWatermarkCheckerData is attached to high watermark check results
 type HighWatermarkCheckerData struct {
-	// HighWatermark is the watermark percentage value
-	HighWatermark uint `json:"high_watermark"`
+	// WatermarkWarning is the watermark warning percentage value
+	WatermarkWarning uint `json:"watermark_warning"`
+	// WatermarkCritical is the watermark critical percentage value
+	WatermarkCritical uint `json:"watermark_critical"`
 	// Path is the absolute path to check
 	Path string `json:"path"`
 	// TotalBytes is the total disk capacity
@@ -50,17 +67,26 @@ type HighWatermarkCheckerData struct {
 	AvailableBytes uint64 `json:"available_bytes"`
 }
 
-// FailureMessage returns failure watermark check message
-func (d HighWatermarkCheckerData) FailureMessage() string {
+// WarningMessage returns warning watermark check message
+func (d HighWatermarkCheckerData) WarningMessage() string {
 	return fmt.Sprintf("disk utilization on %s exceeds %v percent (%s is available out of %s), see https://gravitational.com/telekube/docs/cluster/#garbage-collection",
-		d.Path, d.HighWatermark, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
+		d.Path, d.WatermarkWarning, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
+}
+
+// CriticalMessage returns critical watermark check message
+func (d HighWatermarkCheckerData) CriticalMessage() string {
+	return fmt.Sprintf("disk utilization on %s exceeds %v percent (%s is available out of %s), see https://gravitational.com/telekube/docs/cluster/#garbage-collection",
+		d.Path, d.WatermarkCritical, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
 }
 
 // SuccessMessage returns success watermark check message
 func (d HighWatermarkCheckerData) SuccessMessage() string {
 	return fmt.Sprintf("disk utilization on %s is below %v percent (%s is available out of %s)",
-		d.Path, d.HighWatermark, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
+		d.Path, d.WatermarkWarning, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
 }
 
 // DiskSpaceCheckerID is the checker that checks disk space utilization
 const DiskSpaceCheckerID = "disk-space"
+
+// DefaultCriticalWatermark is the default critical disk usage percentage threshold.
+const DefaultCriticalWatermark = 90
diff --git a/monitoring/storage_linux.go b/monitoring/storage_linux.go
index de6c2baa9..a3b08b45b 100644
--- a/monitoring/storage_linux.go
+++ b/monitoring/storage_linux.go
@@ -39,11 +39,15 @@ import (
 
 // NewStorageChecker creates a new instance of the volume checker
 // using the specified checker as configuration
-func NewStorageChecker(config StorageConfig) health.Checker {
+func NewStorageChecker(config StorageConfig) (health.Checker, error) {
+	if err := config.CheckAndSetDefaults(); err != nil {
+		return nil, trace.Wrap(err)
+	}
+
 	return &storageChecker{
 		StorageConfig: config,
 		osInterface:   &realOS{},
-	}
+	}, nil
 }
 
 // storageChecker verifies volume requirements
@@ -155,30 +159,55 @@ func (c *storageChecker) checkHighWatermark(ctx context.Context, reporter health
 		return trace.BadParameter("disk capacity at %v is 0", c.path)
 	}
+	// Set warning watermark 10 percentage points below the critical watermark.
+	// HighWatermark is unsigned, so clamp at zero: subtracting from a value below
+	// 10 would wrap around to a huge threshold and the warning would never fire.
+	var watermarkWarning uint
+	if c.HighWatermark > 10 {
+		watermarkWarning = c.HighWatermark - 10
+	}
 	checkerData := HighWatermarkCheckerData{
-		HighWatermark:  c.HighWatermark,
-		Path:           c.Path,
-		TotalBytes:     totalBytes,
-		AvailableBytes: availableBytes,
+		WatermarkCritical: c.HighWatermark,
+		WatermarkWarning:  watermarkWarning,
+		Path:              c.Path,
+		TotalBytes:        totalBytes,
+		AvailableBytes:    availableBytes,
 	}
 	checkerDataBytes, err := json.Marshal(checkerData)
 	if err != nil {
 		return trace.Wrap(err)
 	}
-	if float64(totalBytes-availableBytes)/float64(totalBytes)*100 > float64(c.HighWatermark) {
+
+	diskUsagePercent := float64(totalBytes-availableBytes) / float64(totalBytes) * 100
+
+	if diskUsagePercent > float64(checkerData.WatermarkCritical) {
 		reporter.Add(&pb.Probe{
 			Checker:     DiskSpaceCheckerID,
-			Detail:      checkerData.FailureMessage(),
+			Detail:      checkerData.CriticalMessage(),
 			CheckerData: checkerDataBytes,
 			Status:      pb.Probe_Failed,
+			Severity:    pb.Probe_Critical,
 		})
-	} else {
+		return nil
+	}
+
+	if diskUsagePercent > float64(checkerData.WatermarkWarning) {
 		reporter.Add(&pb.Probe{
 			Checker:     DiskSpaceCheckerID,
-			Detail:      checkerData.SuccessMessage(),
+			Detail:      checkerData.WarningMessage(),
 			CheckerData: checkerDataBytes,
-			Status:      pb.Probe_Running,
+			Status:      pb.Probe_Failed,
+			Severity:    pb.Probe_Warning,
 		})
+		return nil
 	}
+
+	reporter.Add(&pb.Probe{
+		Checker:     DiskSpaceCheckerID,
+		Detail:      checkerData.SuccessMessage(),
+		CheckerData: checkerDataBytes,
+		Status:      pb.Probe_Running,
+	})
+
 	return nil
 }
 