From 4188ee040dab163f6fcde485b7434f7b5d6ea3fb Mon Sep 17 00:00:00 2001 From: lyang24 Date: Tue, 6 Feb 2024 19:56:55 -0800 Subject: [PATCH] obs: export metrics about Go GC Assist work This commit introduced functions to extract exposed metrics in go runtime metrics api. The runtime metrics is sampled along in SampleEnvironment call every 10 seconds. New metric GcAssistNS is added as an estimate to amount of effort of user go routines assist in gc activities in nanoseconds. Fixes: https://github.com/cockroachdb/cockroach/issues/88178 Release note: None --- docs/generated/metrics/metrics.html | 1 + pkg/server/status/runtime.go | 97 +++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index b4487a955b6d..53418bca74b8 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1558,6 +1558,7 @@ SERVERsys.cpu.user.percentCurrent user cpu percentage consumed by the CRDB processCPU TimeGAUGEPERCENTAVGNONE SERVERsys.fd.openProcess open file descriptorsFile DescriptorsGAUGECOUNTAVGNONE SERVERsys.fd.softlimitProcess open FD soft limitFile DescriptorsGAUGECOUNTAVGNONE +SERVERsys.gc.assist.nsEstimated total CPU time user goroutines spent to assist the GC processCPU TimeGAUGENANOSECONDSAVGNONE SERVERsys.gc.countTotal number of GC runsGC RunsGAUGECOUNTAVGNONE SERVERsys.gc.pause.nsTotal GC pauseGC PauseGAUGENANOSECONDSAVGNONE SERVERsys.gc.pause.percentCurrent GC pause percentageGC PauseGAUGEPERCENTAVGNONE diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go index 500b561c9432..fb0612459fd8 100644 --- a/pkg/server/status/runtime.go +++ b/pkg/server/status/runtime.go @@ -12,10 +12,12 @@ package status import ( "context" + "fmt" "os" "regexp" "runtime" "runtime/debug" + "runtime/metrics" "time" "github.com/cockroachdb/cockroach/pkg/build" @@ -93,6 +95,12 @@ var ( Measurement: "GC Pause", Unit: metric.Unit_PERCENT, } + metaGCAssistNS = metric.Metadata{ + Name: "sys.gc.assist.ns", + Help: "Estimated total CPU time user goroutines spent to assist the GC process", + Measurement: "CPU Time", + Unit: metric.Unit_NANOSECONDS, + } metaCPUUserNS = metric.Metadata{ Name: "sys.cpu.user.ns", @@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI // error : any issues fetching stats. This should be a warning only. var getCgoMemStats func(context.Context) (uint, uint, error) +// Estimated total CPU time goroutines spent performing GC tasks to assist the +// GC and prevent it from falling behind the application. This metric is an +// overestimate, and not directly comparable to system CPU time measurements. +// Compare only with other /cpu/classes metrics. +const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds" + +var runtimeMetrics = []string{runtimeMetricGCAssist} + +// GoRuntimeSampler are a collection of metrics to sample from golang's runtime environment and +// runtime metrics metadata. It fetches go runtime metrics and provides read access. +// https://pkg.go.dev/runtime/metrics +type GoRuntimeSampler struct { + // The collection of metrics we want to sample. + metricSamples []metrics.Sample + // The mapping to find metric slot in metricSamples by name. + metricIndexes map[string]int +} + +// getIndex finds the position of metrics in the sample array by name. +func (grm *GoRuntimeSampler) getIndex(name string) int { + i, found := grm.metricIndexes[name] + if !found { + panic(fmt.Sprintf("unsampled metric: %s", name)) + } + return i +} + +// float64 gets the sampled value by metrics name as float64. +// N.B. This method will panic if the metrics value is not metrics.KindFloat64. +func (grm *GoRuntimeSampler) float64(name string) float64 { + i := grm.getIndex(name) + return grm.metricSamples[i].Value.Float64() +} + +// sampleRuntimeMetrics reads from metrics.Read api and fill in the value +// in the metricSamples field. +// Benchmark results on 12 core Apple M3 Pro: +// goos: darwin +// goarch: arm64 +// pkg: github.com/cockroachdb/cockroach/pkg/server/status +// BenchmarkGoRuntimeSampler +// BenchmarkGoRuntimeSampler-12 28886398 40.03 ns/op +// +// func BenchmarkGoRuntimeSampler(b *testing.B) { +// s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist}) +// for n := 0; n < b.N; n++ { +// s.sampleRuntimeMetrics() +// } +// } +func (grm *GoRuntimeSampler) sampleRuntimeMetrics() { + metrics.Read(grm.metricSamples) +} + +// NewGoRuntimeSampler constructs a new GoRuntimeSampler object. +// This method will panic on invalid metrics names provided. +func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler { + m := metrics.All() + metricTypes := make(map[string]metrics.ValueKind, len(m)) + for _, desc := range m { + metricTypes[desc.Name] = desc.Kind + } + metricSamples := make([]metrics.Sample, len(metricNames)) + metricIndexes := make(map[string]int, len(metricNames)) + for i, n := range metricNames { + _, hasDesc := metricTypes[n] + if !hasDesc { + panic(fmt.Sprintf("unexpected metric: %s", n)) + } + metricSamples[i] = metrics.Sample{Name: n} + metricIndexes[n] = i + } + + grm := &GoRuntimeSampler{ + metricSamples: metricSamples, + metricIndexes: metricIndexes, + } + return grm +} + // RuntimeStatSampler is used to periodically sample the runtime environment // for useful statistics, performing some rudimentary calculations and storing // the resulting information in a format that can be easily consumed by status @@ -326,6 +413,8 @@ type RuntimeStatSampler struct { // Only show "not implemented" errors once, we don't need the log spam. fdUsageNotImplemented bool + goRuntimeSampler *GoRuntimeSampler + // Metric gauges maintained by the sampler. // Go runtime stats. CgoCalls *metric.Gauge @@ -338,6 +427,7 @@ type RuntimeStatSampler struct { GcCount *metric.Gauge GcPauseNS *metric.Gauge GcPausePercent *metric.GaugeFloat64 + GcAssistNS *metric.Gauge // CPU stats for the CRDB process usage. CPUUserNS *metric.Gauge CPUUserPercent *metric.GaugeFloat64 @@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta startTimeNanos: clock.Now().UnixNano(), initialNetCounters: netCounters, initialDiskCounters: diskCounters, + goRuntimeSampler: NewGoRuntimeSampler(runtimeMetrics), CgoCalls: metric.NewGauge(metaCgoCalls), Goroutines: metric.NewGauge(metaGoroutines), RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU), @@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GcCount: metric.NewGauge(metaGCCount), GcPauseNS: metric.NewGauge(metaGCPauseNS), GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent), + GcAssistNS: metric.NewGauge(metaGCAssistNS), CPUUserNS: metric.NewGauge(metaCPUUserNS), CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent), @@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( gc := &debug.GCStats{} debug.ReadGCStats(gc) + rsr.goRuntimeSampler.sampleRuntimeMetrics() + numCgoCall := runtime.NumCgoCall() numGoroutine := runtime.NumGoroutine() @@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs) gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines() + gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist) + gcAssistNS := int64(gcAssistSeconds * 1e9) // The number of runnable goroutines per CPU is a count, but it can vary // quickly. We don't just want to get a current snapshot of it, we want the // average value since the last sampling. @@ -668,6 +764,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( rsr.GcCount.Update(gc.NumGC) rsr.GcPauseNS.Update(int64(gc.PauseTotal)) rsr.GcPausePercent.Update(gcPauseRatio) + rsr.GcAssistNS.Update(gcAssistNS) rsr.CPUUserNS.Update(procUtime) rsr.CPUUserPercent.Update(procUrate)