From 81868782d7d614bde647f758f4ea08334c459a45 Mon Sep 17 00:00:00 2001
From: Bernard Kim
Date: Mon, 15 Jun 2020 10:01:49 -0700
Subject: [PATCH 1/2] Update disk/storage check

---
 monitoring/storage.go            | 65 ++++++++++++++++++++++++++++----
 monitoring/storage_linux.go      | 42 ++++++++++++++++-----
 monitoring/storage_linux_test.go | 18 +++++++--
 3 files changed, 106 insertions(+), 19 deletions(-)

diff --git a/monitoring/storage.go b/monitoring/storage.go
index 4d6f7b713..1333a52ec 100644
--- a/monitoring/storage.go
+++ b/monitoring/storage.go
@@ -19,6 +19,8 @@ package monitoring
 import (
 	"fmt"
 
+	"github.com/gravitational/trace"
+
 	humanize "github.com/dustin/go-humanize"
 )
 
@@ -34,13 +36,48 @@ type StorageConfig struct {
 	Filesystems []string
 	// MinFreeBytes define minimum free volume capacity
 	MinFreeBytes uint64
-	// HighWatermark is the disk occupancy percentage that is considered degrading
+	// LowWatermark is the disk occupancy percentage that will trigger a warning probe
+	LowWatermark uint
+	// HighWatermark is the disk occupancy percentage that will trigger a critical probe
 	HighWatermark uint
 }
 
+// CheckAndSetDefaults validates that this configuration is correct and sets
+// value defaults where necessary.
+func (c *StorageConfig) CheckAndSetDefaults() error {
+	var errors []error
+	if c.Path == "" {
+		errors = append(errors, trace.BadParameter("volume path must be provided"))
+	}
+
+	if c.LowWatermark > 100 {
+		errors = append(errors, trace.BadParameter("low watermark must be 0-100"))
+	}
+
+	if c.HighWatermark > 100 {
+		errors = append(errors, trace.BadParameter("high watermark must be 0-100"))
+	}
+
+	if c.LowWatermark == 0 {
+		c.LowWatermark = DefaultLowWatermark
+	}
+
+	if c.HighWatermark == 0 {
+		c.HighWatermark = DefaultHighWatermark
+	}
+
+	if c.LowWatermark > c.HighWatermark {
+		c.LowWatermark = c.HighWatermark
+	}
+
+	return trace.NewAggregate(errors...)
+}
+
 // HighWatermarkCheckerData is attached to high watermark check results
 type HighWatermarkCheckerData struct {
-	// HighWatermark is the watermark percentage value
+	// LowWatermark is the low watermark percentage value
+	LowWatermark uint `json:"low_watermark"`
+	// HighWatermark is the high watermark percentage value
 	HighWatermark uint `json:"high_watermark"`
 	// Path is the absolute path to check
 	Path string `json:"path"`
@@ -50,17 +87,31 @@ type HighWatermarkCheckerData struct {
 	AvailableBytes uint64 `json:"available_bytes"`
 }
 
-// FailureMessage returns failure watermark check message
-func (d HighWatermarkCheckerData) FailureMessage() string {
-	return fmt.Sprintf("disk utilization on %s exceeds %v percent (%s is available out of %s), see https://gravitational.com/telekube/docs/cluster/#garbage-collection",
-		d.Path, d.HighWatermark, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
+// WarningMessage returns the message reported when usage exceeds the low watermark.
+func (d HighWatermarkCheckerData) WarningMessage() string {
+	diskUsage := float64(d.TotalBytes-d.AvailableBytes) / float64(d.TotalBytes) * 100
+	return fmt.Sprintf("disk utilization on %s exceeds %v%%, currently at %.1f%% (%s is available out of %s), cluster will degrade if usage exceeds %v%%, see https://gravitational.com/gravity/docs/cluster/#garbage-collection",
+		d.Path, d.LowWatermark, diskUsage, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes), d.HighWatermark)
+}
+
+// CriticalMessage returns the message reported when usage exceeds the high watermark.
+func (d HighWatermarkCheckerData) CriticalMessage() string {
+	diskUsage := float64(d.TotalBytes-d.AvailableBytes) / float64(d.TotalBytes) * 100
+	return fmt.Sprintf("disk utilization on %s exceeds %v%%, currently at %.1f%% (%s is available out of %s), see https://gravitational.com/gravity/docs/cluster/#garbage-collection",
+		d.Path, d.HighWatermark, diskUsage, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
 }
 
 // SuccessMessage returns success watermark check message
 func (d HighWatermarkCheckerData) SuccessMessage() string {
-	return fmt.Sprintf("disk utilization on %s is below %v percent (%s is available out of %s)",
+	return fmt.Sprintf("disk utilization on %s is below %v%% (%s is available out of %s)",
 		d.Path, d.HighWatermark, humanize.Bytes(d.AvailableBytes), humanize.Bytes(d.TotalBytes))
 }
 
 // DiskSpaceCheckerID is the checker that checks disk space utilization
 const DiskSpaceCheckerID = "disk-space"
+
+// DefaultLowWatermark is the default low watermark percentage.
+const DefaultLowWatermark = 80
+
+// DefaultHighWatermark is the default high watermark percentage.
+const DefaultHighWatermark = 90
diff --git a/monitoring/storage_linux.go b/monitoring/storage_linux.go
index de6c2baa9..e017a1a48 100644
--- a/monitoring/storage_linux.go
+++ b/monitoring/storage_linux.go
@@ -39,11 +39,15 @@ import (
 
 // NewStorageChecker creates a new instance of the volume checker
 // using the specified checker as configuration
-func NewStorageChecker(config StorageConfig) health.Checker {
+func NewStorageChecker(config StorageConfig) (health.Checker, error) {
+	if err := config.CheckAndSetDefaults(); err != nil {
+		return nil, trace.Wrap(err)
+	}
+
 	return &storageChecker{
 		StorageConfig: config,
 		osInterface:   &realOS{},
-	}
+	}, nil
 }
 
 // storageChecker verifies volume requirements
@@ -84,7 +88,7 @@ func (c *storageChecker) check(ctx context.Context, reporter health.Reporter) er
 
 	return trace.NewAggregate(c.checkFsType(ctx, reporter),
 		c.checkCapacity(ctx, reporter),
-		c.checkHighWatermark(ctx, reporter),
+		c.checkDiskUsage(ctx, reporter),
 		c.checkWriteSpeed(ctx, reporter))
 }
 
@@ -143,7 +147,9 @@ func (c *storageChecker) checkFsType(ctx context.Context, reporter health.Report
 	return nil
 }
 
-func (c *storageChecker) checkHighWatermark(ctx context.Context, reporter health.Reporter) error {
+// checkDiskUsage checks the disk usage. A warning or critical probe will be
+// reported if the usage percentage is above the set thresholds.
+func (c *storageChecker) checkDiskUsage(ctx context.Context, reporter health.Reporter) error {
 	if c.HighWatermark == 0 {
 		return nil
 	}
@@ -155,6 +161,7 @@ func (c *storageChecker) checkHighWatermark(ctx context.Context, reporter health
 		return trace.BadParameter("disk capacity at %v is 0", c.path)
 	}
 	checkerData := HighWatermarkCheckerData{
+		LowWatermark:   c.LowWatermark,
 		HighWatermark:  c.HighWatermark,
 		Path:           c.Path,
 		TotalBytes:     totalBytes,
@@ -164,21 +171,38 @@ func (c *storageChecker) checkHighWatermark(ctx context.Context, reporter health
 	if err != nil {
 		return trace.Wrap(err)
 	}
-	if float64(totalBytes-availableBytes)/float64(totalBytes)*100 > float64(c.HighWatermark) {
+
+	diskUsagePercent := float64(totalBytes-availableBytes) / float64(totalBytes) * 100
+
+	if diskUsagePercent > float64(checkerData.HighWatermark) {
 		reporter.Add(&pb.Probe{
 			Checker:     DiskSpaceCheckerID,
-			Detail:      checkerData.FailureMessage(),
+			Detail:      checkerData.CriticalMessage(),
 			CheckerData: checkerDataBytes,
 			Status:      pb.Probe_Failed,
+			Severity:    pb.Probe_Critical,
 		})
-	} else {
+		return nil
+	}
+
+	if diskUsagePercent > float64(checkerData.LowWatermark) {
 		reporter.Add(&pb.Probe{
 			Checker:     DiskSpaceCheckerID,
-			Detail:      checkerData.SuccessMessage(),
+			Detail:      checkerData.WarningMessage(),
 			CheckerData: checkerDataBytes,
-			Status:      pb.Probe_Running,
+			Status:      pb.Probe_Failed,
+			Severity:    pb.Probe_Warning,
 		})
+		return nil
 	}
+
+	reporter.Add(&pb.Probe{
+		Checker:     DiskSpaceCheckerID,
+		Detail:      checkerData.SuccessMessage(),
+		CheckerData: checkerDataBytes,
+		Status:      pb.Probe_Running,
+	})
+
 	return nil
 }
 
diff --git a/monitoring/storage_linux_test.go b/monitoring/storage_linux_test.go
index ee750031b..e96cc4fdb 100644
--- a/monitoring/storage_linux_test.go
+++ b/monitoring/storage_linux_test.go
@@ -105,19 +105,31 @@ func (_ *StorageSuite) TestStorage(c *C) {
 		StorageConfig: StorageConfig{
 			Path:          path.Join("/tmp", fmt.Sprintf("%d", time.Now().Unix())),
 			WillBeCreated: true,
-			HighWatermark: 40,
+			LowWatermark:  60,
+			HighWatermark: 80,
 		},
 		osInterface: testOS{mountList: mounts, bytesAvail: 2048},
-	}.probe(c, "high watermark is reached", shallFail)
+	}.probe(c, "low watermark is not reached", shallSucceed)
 
 	storageChecker{
 		StorageConfig: StorageConfig{
 			Path:          path.Join("/tmp", fmt.Sprintf("%d", time.Now().Unix())),
 			WillBeCreated: true,
+			LowWatermark:  40,
 			HighWatermark: 60,
 		},
 		osInterface: testOS{mountList: mounts, bytesAvail: 2048},
-	}.probe(c, "high watermark is not reached", shallSucceed)
+	}.probe(c, "low watermark is reached", shallFail)
+
+	storageChecker{
+		StorageConfig: StorageConfig{
+			Path:          path.Join("/tmp", fmt.Sprintf("%d", time.Now().Unix())),
+			WillBeCreated: true,
+			LowWatermark:  20,
+			HighWatermark: 40,
+		},
+		osInterface: testOS{mountList: mounts, bytesAvail: 2048},
+	}.probe(c, "high watermark is reached", shallFail)
 }
 
 func (_ *StorageSuite) TestMatchesFilesystem(c *C) {

From 0760fd96fe54288c0218d03b24de54a49da9d185 Mon Sep 17 00:00:00 2001
From: Bernard Kim
Date: Mon, 15 Jun 2020 11:20:11 -0700
Subject: [PATCH 2/2] Update default NewStorageChecker

---
 monitoring/defaults.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/monitoring/defaults.go b/monitoring/defaults.go
index 9b7009876..5cdb01a48 100644
--- a/monitoring/defaults.go
+++ b/monitoring/defaults.go
@@ -69,8 +69,8 @@ func GetStorageDriverBootConfigParams(drv string) health.Checker {
 
 // NewStorageChecker creates a new instance of the volume checker
 // using the specified checker as configuration
-func NewStorageChecker(config StorageConfig) health.Checker {
-	return noopChecker{}
+func NewStorageChecker(config StorageConfig) (health.Checker, error) {
+	return noopChecker{}, nil
 }
 
 // NewDNSChecker sends some default queries to monitor DNS / service discovery health