From fafb68b28d839892206e5e688055204c2cf54023 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Tue, 18 Apr 2023 22:32:31 -0400 Subject: [PATCH 1/9] hoststats: add package for collecting host statistics including cpu, memory and disk usage --- agent/setup.go | 4 + lib/hoststats/collector.go | 189 +++++++++++++++++++++++++++++++++++++ lib/hoststats/cpu.go | 118 +++++++++++++++++++++++ lib/hoststats/cpu_test.go | 77 +++++++++++++++ lib/hoststats/host.go | 95 +++++++++++++++++++ lib/hoststats/metrics.go | 75 +++++++++++++++ 6 files changed, 558 insertions(+) create mode 100644 lib/hoststats/collector.go create mode 100644 lib/hoststats/cpu.go create mode 100644 lib/hoststats/cpu_test.go create mode 100644 lib/hoststats/host.go create mode 100644 lib/hoststats/metrics.go diff --git a/agent/setup.go b/agent/setup.go index 9ed993aaf4a6..a97874470400 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -4,6 +4,7 @@ package agent import ( + "context" "fmt" "io" "net" @@ -12,6 +13,7 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/go-hclog" wal "github.com/hashicorp/raft-wal" "github.com/hashicorp/raft-wal/verifier" @@ -117,6 +119,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } + hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir) d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger) if err != nil { @@ -295,6 +298,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G Gauges, raftGauges, serverGauges, + hoststats.Gauges, } // TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc diff --git a/lib/hoststats/collector.go b/lib/hoststats/collector.go new file mode 100644 index 000000000000..a2c7bade67bb --- /dev/null +++ b/lib/hoststats/collector.go @@ -0,0 
+1,189 @@ +package hoststats + +import ( + "context" + "fmt" + "math" + "runtime" + "sync" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-hclog" + "github.com/shirou/gopsutil/v3/disk" + "github.com/shirou/gopsutil/v3/host" + "github.com/shirou/gopsutil/v3/mem" +) + +// Collector collects host resource usage stats +type Collector struct { + numCores int + cpuCalculator map[string]*cpuStatsCalculator + hostStats *HostStats + hostStatsLock sync.RWMutex + dataDir string + + metrics Metrics + baseLabels []metrics.Label + + logger hclog.Logger +} + +// NewCollector returns a Collector. The dataDir is passed in +// so that we can present the disk related statistics for the mountpoint where the dataDir exists +func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector { + logger = logger.Named("host_stats") + collector := initCollector(logger, dataDir) + go collector.loop(ctx) + return collector +} + +// initCollector initializes the Collector but does not start the collection loop +func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector { + numCores := runtime.NumCPU() + statsCalculator := make(map[string]*cpuStatsCalculator) + collector := &Collector{ + cpuCalculator: statsCalculator, + numCores: numCores, + logger: logger, + dataDir: dataDir, + } + + for _, opt := range opts { + opt(collector) + } + + if collector.metrics == nil { + collector.metrics = metrics.Default() + } + return collector +} + +func (h *Collector) loop(ctx context.Context) { + // Start collecting host stats right away and then keep collecting every + // collection interval + next := time.NewTimer(0) + defer next.Stop() + for { + select { + case <-next.C: + h.collect() + next.Reset(hostStatsCollectionInterval) + h.Stats().Emit(h.metrics, h.baseLabels) + + case <-ctx.Done(): + return + } + } +} + +// collect will collect stats related to resource usage of the host +func (h *Collector) 
collect() { + h.hostStatsLock.Lock() + defer h.hostStatsLock.Unlock() + hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} + + // Determine up-time + uptime, err := host.Uptime() + if err != nil { + h.logger.Error("failed to collect uptime stats", "error", err) + uptime = 0 + } + hs.Uptime = uptime + + // Collect memory stats + mstats, err := h.collectMemoryStats() + if err != nil { + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} + } + hs.Memory = mstats + + // Collect cpu stats + cpus, err := h.collectCPUStats() + if err != nil { + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + } + hs.CPU = cpus + + // Collect disk stats + diskStats, err := h.collectDiskStats(h.dataDir) + if err != nil { + h.logger.Error("failed to collect dataDir disk stats", "error", err) + } + hs.DataDirStats = diskStats + + // Update the collected status object. + h.hostStats = hs +} + +func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) { + usage, err := disk.Usage(dir) + if err != nil { + return nil, fmt.Errorf("failed to collect disk usage stats: %w", err) + } + return h.toDiskStats(usage), nil +} + +func (h *Collector) collectMemoryStats() (*MemoryStats, error) { + memStats, err := mem.VirtualMemory() + if err != nil { + return nil, err + } + mem := &MemoryStats{ + Total: memStats.Total, + Available: memStats.Available, + Used: memStats.Used, + UsedPercent: memStats.UsedPercent, + Free: memStats.Free, + } + + return mem, nil +} + +// Stats returns the host stats that have been collected +func (h *Collector) Stats() *HostStats { + h.hostStatsLock.RLock() + defer h.hostStatsLock.RUnlock() + + if h.hostStats == nil { + return &HostStats{} + } + + return h.hostStats.Clone() +} + +// toDiskStats converts a disk.UsageStat into a DiskStats, replacing NaN percentages with zero +func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { + ds := DiskStats{ + Size: usage.Total, + Used: usage.Used, + Available: 
usage.Free, + UsedPercent: usage.UsedPercent, + InodesUsedPercent: usage.InodesUsedPercent, + Path: usage.Path, + } + if math.IsNaN(ds.UsedPercent) { + ds.UsedPercent = 0.0 + } + if math.IsNaN(ds.InodesUsedPercent) { + ds.InodesUsedPercent = 0.0 + } + + return &ds +} + +type CollectorOption func(c *Collector) + +func WithMetrics(m *metrics.Metrics) CollectorOption { + return func(c *Collector) { + c.metrics = m + } +} + +func WithBaseLabels(labels []metrics.Label) CollectorOption { + return func(c *Collector) { + c.baseLabels = labels + } +} diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go new file mode 100644 index 000000000000..0fc3fc28c49c --- /dev/null +++ b/lib/hoststats/cpu.go @@ -0,0 +1,118 @@ +package hoststats + +import ( + "math" + "time" + + "github.com/shirou/gopsutil/v3/cpu" +) + +// cpuStatsCalculator calculates cpu usage percentages +type cpuStatsCalculator struct { + prevIdle float64 + prevUser float64 + prevSystem float64 + prevBusy float64 + prevTotal float64 +} + +// calculate calculates the current cpu usage percentages +func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) { + currentIdle := times.Idle + currentUser := times.User + currentSystem := times.System + currentTotal := times.Total() + currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + + times.Softirq + times.Steal + times.Guest + times.GuestNice + + deltaTotal := currentTotal - h.prevTotal + idle = ((currentIdle - h.prevIdle) / deltaTotal) * 100 + user = ((currentUser - h.prevUser) / deltaTotal) * 100 + system = ((currentSystem - h.prevSystem) / deltaTotal) * 100 + total = ((currentBusy - h.prevBusy) / deltaTotal) * 100 + + // Protect against any invalid values + if math.IsNaN(idle) || math.IsInf(idle, 0) { + idle = 100.0 + } + if math.IsNaN(user) || math.IsInf(user, 0) { + user = 0.0 + } + if math.IsNaN(system) || math.IsInf(system, 0) { + system = 0.0 + } + if math.IsNaN(total) 
|| math.IsInf(total, 0) { + total = 0.0 + } + + h.prevIdle = currentIdle + h.prevUser = currentUser + h.prevSystem = currentSystem + h.prevTotal = currentTotal + h.prevBusy = currentBusy + return +} + +// cpuStats calculates cpu usage percentage +type cpuStats struct { + prevCpuTime float64 + prevTime time.Time + + totalCpus int +} + +// percent calculates the cpu usage percentage based on the current cpu usage +// and the previous cpu usage where usage is given as time in nanoseconds spend +// in the cpu +func (c *cpuStats) percent(cpuTime float64) float64 { + now := time.Now() + + if c.prevCpuTime == 0.0 { + // invoked first time + c.prevCpuTime = cpuTime + c.prevTime = now + return 0.0 + } + + timeDelta := now.Sub(c.prevTime).Nanoseconds() + ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) + c.prevCpuTime = cpuTime + c.prevTime = now + return ret +} + +func (c *cpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { + vDelta := t2 - t1 + if timeDelta <= 0 || vDelta <= 0.0 { + return 0.0 + } + + overall_percent := (vDelta / float64(timeDelta)) * 100.0 + return overall_percent +} + +func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { + + cpuStats, err := cpu.Times(true) + if err != nil { + return nil, err + } + cs := make([]*CPUStats, len(cpuStats)) + for idx, cpuStat := range cpuStats { + percentCalculator, ok := h.cpuCalculator[cpuStat.CPU] + if !ok { + percentCalculator = &cpuStatsCalculator{} + h.cpuCalculator[cpuStat.CPU] = percentCalculator + } + idle, user, system, total := percentCalculator.calculate(cpuStat) + cs[idx] = &CPUStats{ + CPU: cpuStat.CPU, + User: user, + System: system, + Idle: idle, + Total: total, + } + } + + return cs, nil +} diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go new file mode 100644 index 000000000000..6de0823a9648 --- /dev/null +++ b/lib/hoststats/cpu_test.go @@ -0,0 +1,77 @@ +package hoststats + +import ( + "math" + "os" + "runtime" + "testing" + "time" + + 
"github.com/hashicorp/consul/sdk/testutil" + "github.com/shirou/gopsutil/v3/cpu" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCpuStats_percent(t *testing.T) { + cs := &cpuStats{ + totalCpus: runtime.NumCPU(), + } + cs.percent(79.7) + time.Sleep(1 * time.Second) + percent := cs.percent(80.69) + expectedPercent := 98.00 + if percent < expectedPercent && percent > (expectedPercent+1.00) { + t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) + } +} + +func TestHostStats_CPU(t *testing.T) { + + assert := assert.New(t) + + logger := testutil.Logger(t) + cwd, err := os.Getwd() + assert.Nil(err) + hs := initCollector(logger, cwd) + + // Collect twice so we can calculate percents we need to generate some work + // so that the cpu values change + hs.collect() + total := 0 + for i := 1; i < 1000000000; i++ { + total *= i + total = total % i + } + hs.collect() + stats := hs.Stats() + assert.NotZero(len(stats.CPU)) + + for _, cpu := range stats.CPU { + assert.False(math.IsNaN(cpu.Idle)) + assert.False(math.IsNaN(cpu.Total)) + assert.False(math.IsNaN(cpu.System)) + assert.False(math.IsNaN(cpu.User)) + + assert.False(math.IsInf(cpu.Idle, 0)) + assert.False(math.IsInf(cpu.Total, 0)) + assert.False(math.IsInf(cpu.System, 0)) + assert.False(math.IsInf(cpu.User, 0)) + } +} + +func TestCpuStatsCalculator_Nan(t *testing.T) { + times := cpu.TimesStat{ + User: 0.0, + Idle: 100.0, + System: 0.0, + } + + calculator := &cpuStatsCalculator{} + calculator.calculate(times) + idle, user, system, total := calculator.calculate(times) + require.Equal(t, 100.0, idle) + require.Zero(t, user) + require.Zero(t, system) + require.Zero(t, total) +} diff --git a/lib/hoststats/host.go b/lib/hoststats/host.go new file mode 100644 index 000000000000..8ba04c734ada --- /dev/null +++ b/lib/hoststats/host.go @@ -0,0 +1,95 @@ +package hoststats + +import ( + "time" + + "github.com/armon/go-metrics" +) + +var hostStatsCollectionInterval = 10 * 
time.Second + +// HostStats represents resource usage hoststats of the host running a Consul agent +type HostStats struct { + Memory *MemoryStats + CPU []*CPUStats + DataDirStats *DiskStats + Uptime uint64 + Timestamp int64 +} + +func (hs *HostStats) Clone() *HostStats { + clone := *hs + + clone.CPU = make([]*CPUStats, len(hs.CPU)) + for i := range hs.CPU { + cpu := *hs.CPU[i] + clone.CPU[i] = &cpu + } + return &clone +} + +func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { + + if hs.Memory != nil { + sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels) + sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels) + sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels) + sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels) + sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels) + } + + for _, cpu := range hs.CPU { + labels := append(baseLabels, metrics.Label{ + Name: "cpu", + Value: cpu.CPU, + }) + + sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels) + sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels) + sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels) + sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels) + } + + if hs.DataDirStats != nil { + diskLabels := append(baseLabels, metrics.Label{ + Name: "path", + Value: hs.DataDirStats.Path, + }) + + sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels) + sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels) + sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels) + 
sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels) + sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels) + } + + sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels) +} + +// CPUStats represents hoststats related to cpu usage +type CPUStats struct { + CPU string + User float64 + System float64 + Idle float64 + Total float64 +} + +// MemoryStats represents hoststats related to virtual memory usage +type MemoryStats struct { + Total uint64 + Available uint64 + Used uint64 + UsedPercent float64 + Free uint64 +} + +// DiskStats represents hoststats related to disk usage +type DiskStats struct { + Path string + Size uint64 + Used uint64 + Available uint64 + UsedPercent float64 + InodesUsedPercent float64 +} diff --git a/lib/hoststats/metrics.go b/lib/hoststats/metrics.go new file mode 100644 index 000000000000..5cedfa296298 --- /dev/null +++ b/lib/hoststats/metrics.go @@ -0,0 +1,75 @@ +package hoststats + +import ( + "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" +) + +// Metrics defines an interface for the methods used to emit data to the go-metrics library. +// `metrics.Default()` should always satisfy this interface. 
+type Metrics interface { + SetGaugeWithLabels(key []string, val float32, labels []metrics.Label) +} + +var Gauges = []prometheus.GaugeDefinition{ + { + Name: []string{"host", "memory", "total"}, + Help: "Total physical memory in bytes", + }, + { + Name: []string{"host", "memory", "available"}, + Help: "Available physical memory in bytes", + }, + { + Name: []string{"host", "memory", "free"}, + Help: "Free physical memory in bytes", + }, + { + Name: []string{"host", "memory", "used"}, + Help: "Used physical memory in bytes", + }, + { + Name: []string{"host", "memory", "used_percent"}, + Help: "Percentage of physical memory in use", + }, + { + Name: []string{"host", "cpu", "total"}, + Help: "Total cpu utilization", + }, + { + Name: []string{"host", "cpu", "user"}, + Help: "User cpu utilization", + }, + { + Name: []string{"host", "cpu", "idle"}, + Help: "Idle cpu utilization", + }, + { + Name: []string{"host", "cpu", "system"}, + Help: "System cpu utilization", + }, + { + Name: []string{"host", "disk", "size"}, + Help: "Size of disk in bytes", + }, + { + Name: []string{"host", "disk", "used"}, + Help: "Disk usage in bytes", + }, + { + Name: []string{"host", "disk", "available"}, + Help: "Available bytes on disk", + }, + { + Name: []string{"host", "disk", "used_percent"}, + Help: "Percentage of disk space usage", + }, + { + Name: []string{"host", "disk", "inodes_percent"}, + Help: "Percentage of disk inodes usage", + }, + { + Name: []string{"host", "uptime"}, + Help: "System uptime", + }, +} From baffef45bdf2c9ba8fdf8857d4eacc0959efafa4 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Fri, 12 May 2023 16:09:54 -0400 Subject: [PATCH 2/9] add config toggle to disable host metrics and docs --- agent/config/builder.go | 1 + agent/config/config.go | 1 + agent/config/runtime_test.go | 1 + .../TestRuntimeConfig_Sanitize.golden | 3 ++- agent/config/testdata/full-config.hcl | 1 + agent/config/testdata/full-config.json | 1 + agent/setup.go | 4 +++- lib/hoststats/cpu.go | 2 +- 
lib/hoststats/cpu_test.go | 23 ++++++++---------- lib/telemetry.go | 5 ++++ .../docs/agent/config/config-files.mdx | 3 +++ website/content/docs/agent/telemetry.mdx | 24 +++++++++++++++++++ 12 files changed, 53 insertions(+), 16 deletions(-) diff --git a/agent/config/builder.go b/agent/config/builder.go index 87ee229406a7..845218b018da 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -941,6 +941,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { CirconusCheckTags: stringVal(c.Telemetry.CirconusCheckTags), CirconusSubmissionInterval: stringVal(c.Telemetry.CirconusSubmissionInterval), CirconusSubmissionURL: stringVal(c.Telemetry.CirconusSubmissionURL), + DisableHostMetrics: boolVal(c.Telemetry.DisableHostMetrics), DisableHostname: boolVal(c.Telemetry.DisableHostname), DogstatsdAddr: stringVal(c.Telemetry.DogstatsdAddr), DogstatsdTags: c.Telemetry.DogstatsdTags, diff --git a/agent/config/config.go b/agent/config/config.go index a1f41452924b..c9098ec78e9e 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -690,6 +690,7 @@ type Telemetry struct { CirconusCheckTags *string `mapstructure:"circonus_check_tags" json:"circonus_check_tags,omitempty"` CirconusSubmissionInterval *string `mapstructure:"circonus_submission_interval" json:"circonus_submission_interval,omitempty"` CirconusSubmissionURL *string `mapstructure:"circonus_submission_url" json:"circonus_submission_url,omitempty"` + DisableHostMetrics *bool `mapstructure:"disable_host_metrics" json:"disable_host_metrics,omitempty"` DisableHostname *bool `mapstructure:"disable_hostname" json:"disable_hostname,omitempty"` DogstatsdAddr *string `mapstructure:"dogstatsd_addr" json:"dogstatsd_addr,omitempty"` DogstatsdTags []string `mapstructure:"dogstatsd_tags" json:"dogstatsd_tags,omitempty"` diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 1064829cd3a1..a078107e1e39 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ 
-6754,6 +6754,7 @@ func TestLoad_FullConfig(t *testing.T) { Expiration: 15 * time.Second, Name: "ftO6DySn", // notice this is the same as the metrics prefix }, + DisableHostMetrics: true, }, TLS: tlsutil.Config{ InternalRPC: tlsutil.ProtocolConfig{ diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index c17636eef730..5ebb8b178ebb 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -462,6 +462,7 @@ "CirconusSubmissionInterval": "", "CirconusSubmissionURL": "", "Disable": false, + "DisableHostMetrics": false, "DisableHostname": false, "DogstatsdAddr": "", "DogstatsdTags": [], @@ -510,4 +511,4 @@ "VersionPrerelease": "", "Watches": [], "XDSUpdateRateLimit": 0 -} \ No newline at end of file +} diff --git a/agent/config/testdata/full-config.hcl b/agent/config/testdata/full-config.hcl index c29c334b9504..912f5dfe7087 100644 --- a/agent/config/testdata/full-config.hcl +++ b/agent/config/testdata/full-config.hcl @@ -690,6 +690,7 @@ telemetry { circonus_check_tags = "prvO4uBl" circonus_submission_interval = "DolzaflP" circonus_submission_url = "gTcbS93G" + disable_host_metrics = true disable_hostname = true dogstatsd_addr = "0wSndumK" dogstatsd_tags = [ "3N81zSUB","Xtj8AnXZ" ] diff --git a/agent/config/testdata/full-config.json b/agent/config/testdata/full-config.json index 7640394a4fa0..58f2e54acbc1 100644 --- a/agent/config/testdata/full-config.json +++ b/agent/config/testdata/full-config.json @@ -808,6 +808,7 @@ "circonus_check_tags": "prvO4uBl", "circonus_submission_interval": "DolzaflP", "circonus_submission_url": "gTcbS93G", + "disable_host_metrics": true, "disable_hostname": true, "dogstatsd_addr": "0wSndumK", "dogstatsd_tags": [ diff --git a/agent/setup.go b/agent/setup.go index a97874470400..2419b07868d2 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -119,7 +119,9 @@ func NewBaseDeps(configLoader 
ConfigLoader, logOut io.Writer, providedLogger hcl if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } - hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir) + if !cfg.Telemetry.DisableHostMetrics { + hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir) + } d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger) if err != nil { diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go index 0fc3fc28c49c..f00c6a91dfa9 100644 --- a/lib/hoststats/cpu.go +++ b/lib/hoststats/cpu.go @@ -21,9 +21,9 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) (idle float64, user currentIdle := times.Idle currentUser := times.User currentSystem := times.System - currentTotal := times.Total() currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + times.Softirq + times.Steal + times.Guest + times.GuestNice + currentTotal := currentBusy + currentIdle deltaTotal := currentTotal - h.prevTotal idle = ((currentIdle - h.prevIdle) / deltaTotal) * 100 diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go index 6de0823a9648..421ec6fb7c07 100644 --- a/lib/hoststats/cpu_test.go +++ b/lib/hoststats/cpu_test.go @@ -27,12 +27,9 @@ func TestCpuStats_percent(t *testing.T) { } func TestHostStats_CPU(t *testing.T) { - - assert := assert.New(t) - logger := testutil.Logger(t) cwd, err := os.Getwd() - assert.Nil(err) + assert.Nil(t, err) hs := initCollector(logger, cwd) // Collect twice so we can calculate percents we need to generate some work @@ -45,18 +42,18 @@ func TestHostStats_CPU(t *testing.T) { } hs.collect() stats := hs.Stats() - assert.NotZero(len(stats.CPU)) + assert.NotZero(t, len(stats.CPU)) for _, cpu := range stats.CPU { - assert.False(math.IsNaN(cpu.Idle)) - assert.False(math.IsNaN(cpu.Total)) - assert.False(math.IsNaN(cpu.System)) - assert.False(math.IsNaN(cpu.User)) + assert.False(t, math.IsNaN(cpu.Idle)) + assert.False(t, math.IsNaN(cpu.Total)) + assert.False(t, 
math.IsNaN(cpu.System)) + assert.False(t, math.IsNaN(cpu.User)) - assert.False(math.IsInf(cpu.Idle, 0)) - assert.False(math.IsInf(cpu.Total, 0)) - assert.False(math.IsInf(cpu.System, 0)) - assert.False(math.IsInf(cpu.User, 0)) + assert.False(t, math.IsInf(cpu.Idle, 0)) + assert.False(t, math.IsInf(cpu.Total, 0)) + assert.False(t, math.IsInf(cpu.System, 0)) + assert.False(t, math.IsInf(cpu.User, 0)) } } diff --git a/lib/telemetry.go b/lib/telemetry.go index 2d87707c3318..9b853169ec79 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -210,6 +210,11 @@ type TelemetryConfig struct { // // hcl: telemetry { prometheus_retention_time = "duration" } PrometheusOpts prometheus.PrometheusOpts + + // DisableHostMetrics will disable metrics collected about the host system such as cpu, memory and disk usage. + // + // hcl: telemetry { disable_host_metrics = (true|false) } + DisableHostMetrics bool } // MetricsHandler provides an http.Handler for displaying metrics. diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index 3d6efc68f084..8e5dd9ed4dcb 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1817,6 +1817,9 @@ subsystem that provides Consul's service mesh capabilities. be used based on **where** this particular instance is running (e.g. a specific geo location or datacenter, dc:sfo). By default, this is left blank and not used. + - `disable_host_metrics` ((#telemetry-disable_host_metrics)) + This disables reporting of host metrics about system resources, defaults to false. + - `disable_hostname` ((#telemetry-disable_hostname)) This controls whether or not to prepend runtime telemetry with the machine's hostname, defaults to false. 
diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 27626d9be8d5..e9b56d0b03ff 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -755,3 +755,27 @@ Consul attaches the following labels to metric values. | `peer_id` | The ID of a peer connected to the reporting cluster or leader. | Any UUID | | `partition` | Name of the partition that the peering is created in. | Any defined partition name in the cluster | +## Server Host Metrics + +Consul servers report the following metrics about the host's system resources + +**Requirements:** +- Consul 1.15.3+ + +| Metric | Description | Unit | Type | +| ----------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ------- | +| `consul.host.memory.total` | The total physical memory in bytes | mixed | mixed | +| `consul.host.memory.available` | The available physical memory in bytes | mixed | mixed | +| `consul.host.memory.free` | The free physical memory in bytes | mixed | mixed | +| `consul.host.memory.used` | The used physical memory in bytes | mixed | mixed | +| `consul.host.memory.used_percent` | The used physical memory as a percentage of total physical memory | mixed | mixed | +| `consul.host.cpu.total` | The host's total cpu utilization +| `consul.host.cpu.user` | The cpu utilization in user space +| `consul.host.cpu.idle` | The cpu utilization in idle state +| `consul.host.cpu.system` | The cpu utilization in system space +| `consul.host.disk.size` | The size in bytes of the data_dir disk +| `consul.host.disk.used` | The number of bytes used on the data_dir disk +| `consul.host.disk.available` | The number of bytes available on the data_dir disk +| 
`consul.host.disk.used_percent` | The percentage of disk space used on the data_dir disk +| `consul.host.disk.inodes_percent` | The percentage of inode usage on the data_dir disk +| `consul.host.uptime` | The uptime of the host in seconds From 369619ba3a946609c61ff9b64ee2af7b8f081207 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Tue, 16 May 2023 16:30:31 -0400 Subject: [PATCH 3/9] hoststats: add cpu.iowait state tracking --- lib/hoststats/cpu.go | 93 ++++++++---------------- lib/hoststats/cpu_test.go | 26 ++----- lib/hoststats/host.go | 2 + lib/hoststats/metrics.go | 4 + website/content/docs/agent/telemetry.mdx | 1 + 5 files changed, 42 insertions(+), 84 deletions(-) diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go index f00c6a91dfa9..1ac0fc859d60 100644 --- a/lib/hoststats/cpu.go +++ b/lib/hoststats/cpu.go @@ -9,48 +9,50 @@ import ( // cpuStatsCalculator calculates cpu usage percentages type cpuStatsCalculator struct { - prevIdle float64 - prevUser float64 - prevSystem float64 - prevBusy float64 - prevTotal float64 + prev cpu.TimesStat + prevBusy float64 + prevTotal float64 } // calculate calculates the current cpu usage percentages -func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) { - currentIdle := times.Idle - currentUser := times.User - currentSystem := times.System +func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { + currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + times.Softirq + times.Steal + times.Guest + times.GuestNice - currentTotal := currentBusy + currentIdle + currentTotal := currentBusy + times.Idle deltaTotal := currentTotal - h.prevTotal - idle = ((currentIdle - h.prevIdle) / deltaTotal) * 100 - user = ((currentUser - h.prevUser) / deltaTotal) * 100 - system = ((currentSystem - h.prevSystem) / deltaTotal) * 100 - total = ((currentBusy - h.prevBusy) / deltaTotal) * 100 + stats := &CPUStats{ + CPU: times.CPU, + + 
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100, + User: ((times.User - h.prev.User) / deltaTotal) * 100, + System: ((times.System - h.prev.System) / deltaTotal) * 100, + Iowait: ((times.Iowait - h.prev.Iowait) / deltaTotal) * 100, + Total: ((currentBusy - h.prevBusy) / deltaTotal) * 100, + } // Protect against any invalid values - if math.IsNaN(idle) || math.IsInf(idle, 0) { - idle = 100.0 + if math.IsNaN(stats.Idle) || math.IsInf(stats.Idle, 0) { + stats.Idle = 100.0 } - if math.IsNaN(user) || math.IsInf(user, 0) { - user = 0.0 + if math.IsNaN(stats.User) || math.IsInf(stats.User, 0) { + stats.User = 0.0 } - if math.IsNaN(system) || math.IsInf(system, 0) { - system = 0.0 + if math.IsNaN(stats.System) || math.IsInf(stats.System, 0) { + stats.System = 0.0 } - if math.IsNaN(total) || math.IsInf(total, 0) { - total = 0.0 + if math.IsNaN(stats.Iowait) || math.IsInf(stats.Iowait, 0) { + stats.Iowait = 0.0 + } + if math.IsNaN(stats.Total) || math.IsInf(stats.Total, 0) { + stats.Total = 0.0 } - h.prevIdle = currentIdle - h.prevUser = currentUser - h.prevSystem = currentSystem + h.prev = times h.prevTotal = currentTotal h.prevBusy = currentBusy - return + return stats } // cpuStats calculates cpu usage percentage @@ -61,36 +63,6 @@ type cpuStats struct { totalCpus int } -// percent calculates the cpu usage percentage based on the current cpu usage -// and the previous cpu usage where usage is given as time in nanoseconds spend -// in the cpu -func (c *cpuStats) percent(cpuTime float64) float64 { - now := time.Now() - - if c.prevCpuTime == 0.0 { - // invoked first time - c.prevCpuTime = cpuTime - c.prevTime = now - return 0.0 - } - - timeDelta := now.Sub(c.prevTime).Nanoseconds() - ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta) - c.prevCpuTime = cpuTime - c.prevTime = now - return ret -} - -func (c *cpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 { - vDelta := t2 - t1 - if timeDelta <= 0 || vDelta <= 0.0 { - return 0.0 - } - - 
overall_percent := (vDelta / float64(timeDelta)) * 100.0 - return overall_percent -} - func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { cpuStats, err := cpu.Times(true) @@ -104,14 +76,7 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { percentCalculator = &cpuStatsCalculator{} h.cpuCalculator[cpuStat.CPU] = percentCalculator } - idle, user, system, total := percentCalculator.calculate(cpuStat) - cs[idx] = &CPUStats{ - CPU: cpuStat.CPU, - User: user, - System: system, - Idle: idle, - Total: total, - } + cs[idx] = percentCalculator.calculate(cpuStat) } return cs, nil diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go index 421ec6fb7c07..3e5d6e81ed7d 100644 --- a/lib/hoststats/cpu_test.go +++ b/lib/hoststats/cpu_test.go @@ -3,9 +3,7 @@ package hoststats import ( "math" "os" - "runtime" "testing" - "time" "github.com/hashicorp/consul/sdk/testutil" "github.com/shirou/gopsutil/v3/cpu" @@ -13,19 +11,6 @@ import ( "github.com/stretchr/testify/require" ) -func TestCpuStats_percent(t *testing.T) { - cs := &cpuStats{ - totalCpus: runtime.NumCPU(), - } - cs.percent(79.7) - time.Sleep(1 * time.Second) - percent := cs.percent(80.69) - expectedPercent := 98.00 - if percent < expectedPercent && percent > (expectedPercent+1.00) { - t.Fatalf("expected: %v, actual: %v", expectedPercent, percent) - } -} - func TestHostStats_CPU(t *testing.T) { logger := testutil.Logger(t) cwd, err := os.Getwd() @@ -66,9 +51,10 @@ func TestCpuStatsCalculator_Nan(t *testing.T) { calculator := &cpuStatsCalculator{} calculator.calculate(times) - idle, user, system, total := calculator.calculate(times) - require.Equal(t, 100.0, idle) - require.Zero(t, user) - require.Zero(t, system) - require.Zero(t, total) + stats := calculator.calculate(times) + require.Equal(t, 100.0, stats.Idle) + require.Zero(t, stats.User) + require.Zero(t, stats.System) + require.Zero(t, stats.Iowait) + require.Zero(t, stats.Total) } diff --git a/lib/hoststats/host.go 
b/lib/hoststats/host.go index 8ba04c734ada..aa90b6373ef1 100644 --- a/lib/hoststats/host.go +++ b/lib/hoststats/host.go @@ -47,6 +47,7 @@ func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels) sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels) sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels) + sink.SetGaugeWithLabels([]string{"host", "cpu", "iowait"}, float32(cpu.Iowait), labels) sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels) } @@ -72,6 +73,7 @@ type CPUStats struct { User float64 System float64 Idle float64 + Iowait float64 Total float64 } diff --git a/lib/hoststats/metrics.go b/lib/hoststats/metrics.go index 5cedfa296298..c89d40b813ff 100644 --- a/lib/hoststats/metrics.go +++ b/lib/hoststats/metrics.go @@ -44,6 +44,10 @@ var Gauges = []prometheus.GaugeDefinition{ Name: []string{"host", "cpu", "idle"}, Help: "Idle cpu utilization", }, + { + Name: []string{"host", "cpu", "iowait"}, + Help: "Iowait cpu utilization", + }, { Name: []string{"host", "cpu", "system"}, Help: "System cpu utilization", diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index e9b56d0b03ff..b0fa38669205 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -772,6 +772,7 @@ Consul servers report the following metrics about the host's system resources | `consul.host.cpu.total` | The host's total cpu utilization | `consul.host.cpu.user` | The cpu utilization in user space | `consul.host.cpu.idle` | The cpu utilization in idle state +| `consul.host.cpu.iowait` | The cpu utilization in iowait state | `consul.host.cpu.system` | The cpu utilization in system space | `consul.host.disk.size` | The size in bytes of the data_dir disk | `consul.host.disk.used` | The number of bytes used on the data_dir 
disk From b7f5032136f8893f1be2dfdb37803e56441f9e32 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Tue, 16 May 2023 16:45:34 -0400 Subject: [PATCH 4/9] hoststats: cancel host collector when base deps Close is called --- agent/setup.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/agent/setup.go b/agent/setup.go index 2419b07868d2..881d094da0af 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -61,6 +61,7 @@ type BaseDeps struct { WatchedFiles []string deregisterBalancer, deregisterResolver func() + stopHostCollector func() } type ConfigLoader func(source config.Source) (config.LoadResult, error) @@ -119,8 +120,10 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } - if !cfg.Telemetry.DisableHostMetrics { - hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir) + if !cfg.Telemetry.Disable && !cfg.Telemetry.DisableHostMetrics { + ctx, cancel := context.WithCancel(context.Background()) + hoststats.NewCollector(ctx, d.Logger, cfg.DataDir) + d.stopHostCollector = cancel } d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger) @@ -219,11 +222,10 @@ func (bd BaseDeps) Close() { bd.AutoConfig.Stop() bd.MetricsConfig.Cancel() - if fn := bd.deregisterBalancer; fn != nil { - fn() - } - if fn := bd.deregisterResolver; fn != nil { - fn() + for _, fn := range []func(){bd.deregisterBalancer, bd.deregisterResolver, bd.stopHostCollector} { + if fn != nil { + fn() + } } } From ac6071721c80a2c5ffc1b1ba0bbd80f5b3faa095 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Fri, 12 May 2023 16:16:48 -0400 Subject: [PATCH 5/9] .changelog: add 17038 entry --- .changelog/17038.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/17038.txt diff --git a/.changelog/17038.txt b/.changelog/17038.txt new file mode 100644 index 000000000000..0e206836d97d --- /dev/null +++ b/.changelog/17038.txt @@ -0,0 
+1,3 @@ +```release-note:improvement +agent: add new metrics to track cpu disk and memory usage for server hosts +``` From 2a3dd5066a368b1538744ad8e20c4953f9f57f23 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Mon, 22 May 2023 17:02:56 -0400 Subject: [PATCH 6/9] pr review fixes and documentation --- .changelog/17038.txt | 2 +- agent/setup.go | 4 +-- lib/hoststats/collector.go | 46 ++++++++++++------------ lib/hoststats/cpu.go | 28 ++++++++------- lib/hoststats/cpu_test.go | 6 ++-- lib/hoststats/host.go | 11 ++---- website/content/docs/agent/telemetry.mdx | 5 ++- 7 files changed, 50 insertions(+), 52 deletions(-) diff --git a/.changelog/17038.txt b/.changelog/17038.txt index 0e206836d97d..b3a47f98a777 100644 --- a/.changelog/17038.txt +++ b/.changelog/17038.txt @@ -1,3 +1,3 @@ ```release-note:improvement -agent: add new metrics to track cpu disk and memory usage for server hosts +agent: add new metrics to track cpu disk and memory usage for server hosts (defaults to: enabled) ``` diff --git a/agent/setup.go b/agent/setup.go index 881d094da0af..1e6dfc9f4a25 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -13,7 +13,6 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/go-hclog" wal "github.com/hashicorp/raft-wal" "github.com/hashicorp/raft-wal/verifier" @@ -43,6 +42,7 @@ import ( "github.com/hashicorp/consul/agent/xds" "github.com/hashicorp/consul/ipaddr" "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/tlsutil" ) @@ -61,7 +61,7 @@ type BaseDeps struct { WatchedFiles []string deregisterBalancer, deregisterResolver func() - stopHostCollector func() + stopHostCollector context.CancelFunc } type ConfigLoader func(source config.Source) (config.LoadResult, error) diff --git a/lib/hoststats/collector.go b/lib/hoststats/collector.go index a2c7bade67bb..c4c57b35c5c2 100644 
--- a/lib/hoststats/collector.go +++ b/lib/hoststats/collector.go @@ -59,7 +59,7 @@ func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) return collector } -func (h *Collector) loop(ctx context.Context) { +func (c *Collector) loop(ctx context.Context) { // Start collecting host stats right away and then keep collecting every // collection interval next := time.NewTimer(0) @@ -67,9 +67,9 @@ func (h *Collector) loop(ctx context.Context) { for { select { case <-next.C: - h.collect() + c.collect() next.Reset(hostStatsCollectionInterval) - h.Stats().Emit(h.metrics, h.baseLabels) + c.Stats().Emit(c.metrics, c.baseLabels) case <-ctx.Done(): return @@ -78,55 +78,55 @@ func (h *Collector) loop(ctx context.Context) { } // collect will collect stats related to resource usage of the host -func (h *Collector) collect() { - h.hostStatsLock.Lock() - defer h.hostStatsLock.Unlock() +func (c *Collector) collect() { hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} // Determine up-time uptime, err := host.Uptime() if err != nil { - h.logger.Error("failed to collect uptime stats", "error", err) + c.logger.Error("failed to collect uptime stats", "error", err) uptime = 0 } hs.Uptime = uptime // Collect memory stats - mstats, err := h.collectMemoryStats() + mstats, err := c.collectMemoryStats() if err != nil { - h.logger.Error("failed to collect memory stats", "error", err) + c.logger.Error("failed to collect memory stats", "error", err) mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats - cpus, err := h.collectCPUStats() + cpus, err := c.collectCPUStats() if err != nil { - h.logger.Error("failed to collect cpu stats", "error", err) + c.logger.Error("failed to collect cpu stats", "error", err) cpus = []*CPUStats{} } hs.CPU = cpus // Collect disk stats - diskStats, err := h.collectDiskStats(h.dataDir) + diskStats, err := c.collectDiskStats(c.dataDir) if err != nil { - h.logger.Error("failed to collect dataDir disk stats", "error", err) + 
c.logger.Error("failed to collect dataDir disk stats", "error", err) } hs.DataDirStats = diskStats // Update the collected status object. - h.hostStats = hs + c.hostStatsLock.Lock() + c.hostStats = hs + c.hostStatsLock.Unlock() } -func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) { +func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) { usage, err := disk.Usage(dir) if err != nil { return nil, fmt.Errorf("failed to collect disk usage stats: %w", err) } - return h.toDiskStats(usage), nil + return c.toDiskStats(usage), nil } -func (h *Collector) collectMemoryStats() (*MemoryStats, error) { +func (c *Collector) collectMemoryStats() (*MemoryStats, error) { memStats, err := mem.VirtualMemory() if err != nil { return nil, err @@ -143,19 +143,19 @@ func (h *Collector) collectMemoryStats() (*MemoryStats, error) { } // Stats returns the host stats that has been collected -func (h *Collector) Stats() *HostStats { - h.hostStatsLock.RLock() - defer h.hostStatsLock.RUnlock() +func (c *Collector) Stats() *HostStats { + c.hostStatsLock.RLock() + defer c.hostStatsLock.RUnlock() - if h.hostStats == nil { + if c.hostStats == nil { return &HostStats{} } - return h.hostStats.Clone() + return c.hostStats.Clone() } // toDiskStats merges UsageStat and PartitionStat to create a DiskStat -func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { +func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { ds := DiskStats{ Size: usage.Total, Used: usage.Used, diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go index 1ac0fc859d60..45633b40df06 100644 --- a/lib/hoststats/cpu.go +++ b/lib/hoststats/cpu.go @@ -2,7 +2,6 @@ package hoststats import ( "math" - "time" "github.com/shirou/gopsutil/v3/cpu" ) @@ -14,17 +13,28 @@ type cpuStatsCalculator struct { prevTotal float64 } -// calculate calculates the current cpu usage percentages +// calculate the current cpu usage percentages. 
+// Since the cpu.TimesStat captures the total time a cpu spent in various states +// this function tracks the last seen stat and derives each cpu state's utilization +// as a percentage of the total change in cpu time between calls. +// The first time calculate is called CPUStats will report %100 idle +// usage since there is not a previous value to calculate against func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { + // sum all none idle counters to get the total busy cpu time currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + times.Softirq + times.Steal + times.Guest + times.GuestNice + // sum of the total cpu time currentTotal := currentBusy + times.Idle + // calculate how much cpu time has passed since last calculation deltaTotal := currentTotal - h.prevTotal + stats := &CPUStats{ CPU: times.CPU, + // calculate each percentage as the ratio of the change + // in each state's time to the total change in cpu time Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100, User: ((times.User - h.prev.User) / deltaTotal) * 100, System: ((times.System - h.prev.System) / deltaTotal) * 100, @@ -55,15 +65,7 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { return stats } -// cpuStats calculates cpu usage percentage -type cpuStats struct { - prevCpuTime float64 - prevTime time.Time - - totalCpus int -} - -func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { +func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) { cpuStats, err := cpu.Times(true) if err != nil { @@ -71,10 +73,10 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { } cs := make([]*CPUStats, len(cpuStats)) for idx, cpuStat := range cpuStats { - percentCalculator, ok := h.cpuCalculator[cpuStat.CPU] + percentCalculator, ok := c.cpuCalculator[cpuStat.CPU] if !ok { percentCalculator = &cpuStatsCalculator{} - h.cpuCalculator[cpuStat.CPU] = percentCalculator + c.cpuCalculator[cpuStat.CPU] 
= percentCalculator } cs[idx] = percentCalculator.calculate(cpuStat) } diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go index 3e5d6e81ed7d..5d5efbe9769a 100644 --- a/lib/hoststats/cpu_test.go +++ b/lib/hoststats/cpu_test.go @@ -4,6 +4,7 @@ import ( "math" "os" "testing" + "time" "github.com/hashicorp/consul/sdk/testutil" "github.com/shirou/gopsutil/v3/cpu" @@ -20,10 +21,7 @@ func TestHostStats_CPU(t *testing.T) { // Collect twice so we can calculate percents we need to generate some work // so that the cpu values change hs.collect() - total := 0 - for i := 1; i < 1000000000; i++ { - total *= i - total = total % i + for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; { } hs.collect() stats := hs.Stats() diff --git a/lib/hoststats/host.go b/lib/hoststats/host.go index aa90b6373ef1..426cf43ea21e 100644 --- a/lib/hoststats/host.go +++ b/lib/hoststats/host.go @@ -18,14 +18,9 @@ type HostStats struct { } func (hs *HostStats) Clone() *HostStats { - clone := *hs - - clone.CPU = make([]*CPUStats, len(hs.CPU)) - for i := range hs.CPU { - cpu := *hs.CPU[i] - clone.CPU[i] = &cpu - } - return &clone + clone := &HostStats{} + *clone = *hs + return clone } func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index b0fa38669205..8c62507e9d72 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -757,7 +757,10 @@ Consul attaches the following labels to metric values. ## Server Host Metrics -Consul servers report the following metrics about the host's system resources +Consul servers report the following metrics about the host's system resources. +Note that if the Consul server is operating inside a container these metrics +still report host resource usage and do not report any resource limits placed +on the container. 
**Requirements:** - Consul 1.15.3+ From 50621fa098423dbe0b0298dbd3a8440b8241fa16 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Wed, 24 May 2023 17:01:03 -0400 Subject: [PATCH 7/9] agent/config: change disable_host_metrics to enable_host_metrics and default to false unless configured /w cloud integration --- agent/config/builder.go | 4 +++- agent/config/builder_test.go | 19 +++++++++++++++++++ agent/config/config.go | 2 +- agent/config/runtime_test.go | 2 +- .../TestRuntimeConfig_Sanitize.golden | 4 ++-- agent/config/testdata/full-config.hcl | 2 +- agent/config/testdata/full-config.json | 2 +- agent/setup.go | 2 +- lib/telemetry.go | 10 +++++----- .../docs/agent/config/config-files.mdx | 6 +++--- 10 files changed, 37 insertions(+), 16 deletions(-) diff --git a/agent/config/builder.go b/agent/config/builder.go index 845218b018da..063771b0f7d0 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -941,7 +941,6 @@ func (b *builder) build() (rt RuntimeConfig, err error) { CirconusCheckTags: stringVal(c.Telemetry.CirconusCheckTags), CirconusSubmissionInterval: stringVal(c.Telemetry.CirconusSubmissionInterval), CirconusSubmissionURL: stringVal(c.Telemetry.CirconusSubmissionURL), - DisableHostMetrics: boolVal(c.Telemetry.DisableHostMetrics), DisableHostname: boolVal(c.Telemetry.DisableHostname), DogstatsdAddr: stringVal(c.Telemetry.DogstatsdAddr), DogstatsdTags: c.Telemetry.DogstatsdTags, @@ -1107,6 +1106,9 @@ func (b *builder) build() (rt RuntimeConfig, err error) { LocalProxyConfigResyncInterval: 30 * time.Second, } + // host metrics are enabled by default if consul is configured with HashiCorp Cloud Platform integration + rt.Telemetry.EnableHostMetrics = boolValWithDefault(c.Telemetry.EnableHostMetrics, rt.IsCloudEnabled()) + rt.TLS, err = b.buildTLSConfig(rt, c.TLS) if err != nil { return RuntimeConfig{}, err diff --git a/agent/config/builder_test.go b/agent/config/builder_test.go index 28d5b2972c3d..3eb81fdee4de 100644 --- 
a/agent/config/builder_test.go +++ b/agent/config/builder_test.go @@ -556,3 +556,22 @@ func TestBuilder_parsePrefixFilter(t *testing.T) { } }) } + +func TestBuidler_hostMetricsWithCloud(t *testing.T) { + devMode := true + builderOpts := LoadOpts{ + DevMode: &devMode, + DefaultConfig: FileSource{ + Name: "test", + Format: "hcl", + Data: `cloud{ resource_id = "abc" client_id = "abc" client_secret = "abc"}`, + }, + } + + result, err := Load(builderOpts) + require.NoError(t, err) + require.Empty(t, result.Warnings) + cfg := result.RuntimeConfig + require.NotNil(t, cfg) + require.True(t, cfg.Telemetry.EnableHostMetrics) +} diff --git a/agent/config/config.go b/agent/config/config.go index c9098ec78e9e..e26d6edc4d95 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -690,8 +690,8 @@ type Telemetry struct { CirconusCheckTags *string `mapstructure:"circonus_check_tags" json:"circonus_check_tags,omitempty"` CirconusSubmissionInterval *string `mapstructure:"circonus_submission_interval" json:"circonus_submission_interval,omitempty"` CirconusSubmissionURL *string `mapstructure:"circonus_submission_url" json:"circonus_submission_url,omitempty"` - DisableHostMetrics *bool `mapstructure:"disable_host_metrics" json:"disable_host_metrics,omitempty"` DisableHostname *bool `mapstructure:"disable_hostname" json:"disable_hostname,omitempty"` + EnableHostMetrics *bool `mapstructure:"enable_host_metrics" json:"enable_host_metrics,omitempty"` DogstatsdAddr *string `mapstructure:"dogstatsd_addr" json:"dogstatsd_addr,omitempty"` DogstatsdTags []string `mapstructure:"dogstatsd_tags" json:"dogstatsd_tags,omitempty"` RetryFailedConfiguration *bool `mapstructure:"retry_failed_connection" json:"retry_failed_connection,omitempty"` diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index a078107e1e39..fa95fbb43b33 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -6754,7 +6754,7 @@ func TestLoad_FullConfig(t *testing.T) { 
Expiration: 15 * time.Second, Name: "ftO6DySn", // notice this is the same as the metrics prefix }, - DisableHostMetrics: true, + EnableHostMetrics: true, }, TLS: tlsutil.Config{ InternalRPC: tlsutil.ProtocolConfig{ diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index 5ebb8b178ebb..334f5f8c8ff5 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -462,10 +462,10 @@ "CirconusSubmissionInterval": "", "CirconusSubmissionURL": "", "Disable": false, - "DisableHostMetrics": false, "DisableHostname": false, "DogstatsdAddr": "", "DogstatsdTags": [], + "EnableHostMetrics": false, "FilterDefault": false, "MetricsPrefix": "", "PrometheusOpts": { @@ -511,4 +511,4 @@ "VersionPrerelease": "", "Watches": [], "XDSUpdateRateLimit": 0 -} +} \ No newline at end of file diff --git a/agent/config/testdata/full-config.hcl b/agent/config/testdata/full-config.hcl index 912f5dfe7087..660e1036086d 100644 --- a/agent/config/testdata/full-config.hcl +++ b/agent/config/testdata/full-config.hcl @@ -690,7 +690,7 @@ telemetry { circonus_check_tags = "prvO4uBl" circonus_submission_interval = "DolzaflP" circonus_submission_url = "gTcbS93G" - disable_host_metrics = true + enable_host_metrics = true disable_hostname = true dogstatsd_addr = "0wSndumK" dogstatsd_tags = [ "3N81zSUB","Xtj8AnXZ" ] diff --git a/agent/config/testdata/full-config.json b/agent/config/testdata/full-config.json index 58f2e54acbc1..52dab37bfa53 100644 --- a/agent/config/testdata/full-config.json +++ b/agent/config/testdata/full-config.json @@ -808,7 +808,7 @@ "circonus_check_tags": "prvO4uBl", "circonus_submission_interval": "DolzaflP", "circonus_submission_url": "gTcbS93G", - "disable_host_metrics": true, + "enable_host_metrics": true, "disable_hostname": true, "dogstatsd_addr": "0wSndumK", "dogstatsd_tags": [ diff --git a/agent/setup.go b/agent/setup.go index 
1e6dfc9f4a25..16d02a2a968e 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -120,7 +120,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } - if !cfg.Telemetry.Disable && !cfg.Telemetry.DisableHostMetrics { + if !cfg.Telemetry.Disable && cfg.Telemetry.EnableHostMetrics { ctx, cancel := context.WithCancel(context.Background()) hoststats.NewCollector(ctx, d.Logger, cfg.DataDir) d.stopHostCollector = cancel diff --git a/lib/telemetry.go b/lib/telemetry.go index 9b853169ec79..b66ec721b1dc 100644 --- a/lib/telemetry.go +++ b/lib/telemetry.go @@ -204,17 +204,17 @@ type TelemetryConfig struct { // hcl: telemetry { statsite_address = string } StatsiteAddr string `json:"statsite_address,omitempty" mapstructure:"statsite_address"` + // EnableHostMetrics will enable metrics collected about the host system such as cpu memory and disk usage. + // + // hcl: telemetry { enable_host_metrics = (true|false) } + EnableHostMetrics bool `json:"enable_host_metrics,omitempty" mapstructure:"enable_host_metrics"` + // PrometheusOpts provides configuration for the PrometheusSink. Currently the only configuration // we acquire from hcl is the retention time. We also use definition slices that are set in agent setup // before being passed to InitTelemmetry. // // hcl: telemetry { prometheus_retention_time = "duration" } PrometheusOpts prometheus.PrometheusOpts - - // DisableHostMetrics will disable metrics collected about the host system such as cpu memory and disk usage. - // - // hcl: telemetry { disable_host_metrics = (true|false) } - DisableHostMetrics bool } // MetricsHandler provides an http.Handler for displaying metrics. 
diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index 8e5dd9ed4dcb..d7ac6923e097 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1817,9 +1817,6 @@ subsystem that provides Consul's service mesh capabilities. be used based on **where** this particular instance is running (e.g. a specific geo location or datacenter, dc:sfo). By default, this is left blank and not used. - - `disable_host_metrics` ((#telemetry-disable_host_metrics)) - This disables reporting of host metrics about system resources, defaults to false. - - `disable_hostname` ((#telemetry-disable_hostname)) This controls whether or not to prepend runtime telemetry with the machine's hostname, defaults to false. @@ -1834,6 +1831,9 @@ subsystem that provides Consul's service mesh capabilities. of global tags that will be added to all telemetry packets sent to DogStatsD. It is a list of strings, where each string looks like "my_tag_name:my_tag_value". + - `enable_host_metrics` ((#telemetry-enable_host_metrics)) + This enables reporting of host metrics about system resources, defaults to false. + - `filter_default` ((#telemetry-filter_default)) This controls whether to allow metrics that have not been specified by the filter. Defaults to `true`, which will allow all metrics when no filters are provided. 
From 6e3ceb0347bcf3d8c0ebad2fc9790f16059b2577 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Wed, 24 May 2023 17:12:03 -0400 Subject: [PATCH 8/9] agent: only register host resource metrics /w prom if enabled --- agent/setup.go | 5 ++++- website/content/docs/agent/telemetry.mdx | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/agent/setup.go b/agent/setup.go index 16d02a2a968e..fba5c2b5dd6c 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -302,7 +302,10 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G Gauges, raftGauges, serverGauges, - hoststats.Gauges, + } + + if cfg.Telemetry.EnableHostMetrics { + gauges = append(gauges, hoststats.Gauges) } // TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 8c62507e9d72..5e1b6b8a88d9 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -757,7 +757,8 @@ Consul attaches the following labels to metric values. ## Server Host Metrics -Consul servers report the following metrics about the host's system resources. +Consul servers can report the following metrics about the host's system resources. +This feature must be enabled in the [agent telemetry configuration](/consul/docs/agent/config/config-files#telemetry-enable_host_metrics). Note that if the Consul server is operating inside a container these metrics still report host resource usage and do not report any resource limits placed on the container. 
From bf69a829f2fd1e371389672aba979c93409afa4f Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Fri, 26 May 2023 14:17:04 -0400 Subject: [PATCH 9/9] agent/config: cleanup test at case end --- agent/config/runtime_test.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index fa95fbb43b33..3b1a77b2cb95 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -46,6 +46,7 @@ type testCase struct { desc string args []string setup func() // TODO: accept a testing.T instead of panic + cleanup func() expected func(rt *RuntimeConfig) expectedErr string expectedWarnings []string @@ -2308,9 +2309,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { }, setup: func() { os.Setenv("HCP_RESOURCE_ID", "env-id") - t.Cleanup(func() { - os.Unsetenv("HCP_RESOURCE_ID") - }) + }, + cleanup: func() { + os.Unsetenv("HCP_RESOURCE_ID") }, expected: func(rt *RuntimeConfig) { rt.DataDir = dataDir @@ -2321,6 +2322,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { // server things rt.ServerMode = true + rt.Telemetry.EnableHostMetrics = true rt.TLS.ServerMode = true rt.LeaveOnTerm = false rt.SkipLeaveOnInt = true @@ -2337,9 +2339,9 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { }, setup: func() { os.Setenv("HCP_RESOURCE_ID", "env-id") - t.Cleanup(func() { - os.Unsetenv("HCP_RESOURCE_ID") - }) + }, + cleanup: func() { + os.Unsetenv("HCP_RESOURCE_ID") }, json: []string{`{ "cloud": { @@ -2360,6 +2362,7 @@ func TestLoad_IntegrationWithFlags(t *testing.T) { // server things rt.ServerMode = true + rt.Telemetry.EnableHostMetrics = true rt.TLS.ServerMode = true rt.LeaveOnTerm = false rt.SkipLeaveOnInt = true @@ -6032,6 +6035,9 @@ func (tc testCase) run(format string, dataDir string) func(t *testing.T) { expected.ACLResolverSettings.EnterpriseMeta = *structs.NodeEnterpriseMetaInPartition(expected.PartitionOrDefault()) prototest.AssertDeepEqual(t, expected, actual, 
cmpopts.EquateEmpty()) + if tc.cleanup != nil { + tc.cleanup() + } } }