obs: export metrics about Go GC Assist work

This commit introduces functions to extract metrics exposed by the Go runtime metrics API. The runtime metrics are sampled in the SampleEnvironment call every 10 seconds. A new metric, GcAssistNS, is added in this PR to represent an estimate of the amount of effort user goroutines spend assisting GC activities, in nanoseconds. Fixes: #88178 Release note: None
cockroachdb · Feb 16, 2024 · 3f1e528 · 3f1e528
1 parent 47a68b5
commit 3f1e528
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 0 deletions.
diff --git a/docs/generated/eventlog.md b/docs/generated/eventlog.md
@@ -266,6 +266,7 @@ An event of type `runtime_stats` is recorded every 10 seconds as server health m
 | `GCRunCount` | The total number of GC runs. | no |
 | `NetHostRecvBytes` | The bytes received on all network interfaces since this process started. | no |
 | `NetHostSendBytes` | The bytes sent on all network interfaces since this process started. | no |
+| `GCAssistNs` | Estimated total CPU time user goroutines spent performing GC tasks to assist the GC. Expressed in nanoseconds. | no |
 
 
 #### Common fields

diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
@@ -1558,6 +1558,7 @@
 <tr><td>SERVER</td><td>sys.cpu.user.percent</td><td>Current user cpu percentage consumed by the CRDB process</td><td>CPU Time</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>SERVER</td><td>sys.fd.open</td><td>Process open file descriptors</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>SERVER</td><td>sys.fd.softlimit</td><td>Process open FD soft limit</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
+<tr><td>SERVER</td><td>sys.gc.assist.ns</td><td>Estimated total CPU time user goroutines spent to assist the GC process</td><td>CPU Time</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>SERVER</td><td>sys.gc.count</td><td>Total number of GC runs</td><td>GC Runs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>SERVER</td><td>sys.gc.pause.ns</td><td>Total GC pause</td><td>GC Pause</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
 <tr><td>SERVER</td><td>sys.gc.pause.percent</td><td>Current GC pause percentage</td><td>GC Pause</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>

diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go
@@ -12,10 +12,12 @@ package status
 
 import (
 	"context"
+	"fmt"
 	"os"
 	"regexp"
 	"runtime"
 	"runtime/debug"
+	"runtime/metrics"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/build"
@@ -93,6 +95,12 @@ var (
 		Measurement: "GC Pause",
 		Unit:        metric.Unit_PERCENT,
 	}
+	metaGCAssistNS = metric.Metadata{
+		Name:        "sys.gc.assist.ns",
+		Help:        "Estimated total CPU time user goroutines spent to assist the GC process",
+		Measurement: "CPU Time",
+		Unit:        metric.Unit_NANOSECONDS,
+	}
 
 	metaCPUUserNS = metric.Metadata{
 		Name:        "sys.cpu.user.ns",
@@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI
 // error           : any issues fetching stats. This should be a warning only.
 var getCgoMemStats func(context.Context) (uint, uint, error)
 
+// runtimeMetricGCAssist names the runtime/metrics sample holding the estimated
+// total CPU time goroutines spent assisting the GC so that it does not fall
+// behind the application. The value is in CPU-seconds and is an overestimate;
+// compare it only with other /cpu/classes metrics, not with system CPU time.
+const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds"
+
+var runtimeMetrics = []string{runtimeMetricGCAssist} // names passed to NewGoRuntimeSampler by NewRuntimeStatSampler
+
+// GoRuntimeSampler holds the set of Go runtime metrics to sample, together with
+// an index from metric name to sample slot. It reads the metrics through the
+// runtime/metrics package (https://pkg.go.dev/runtime/metrics).
+type GoRuntimeSampler struct {
+	// metricSamples is the buffer handed to metrics.Read, one entry per sampled metric.
+	metricSamples []metrics.Sample
+	// metricIndexes maps a metric name to its position in metricSamples.
+	metricIndexes map[string]int
+}
+
+// getIndex returns the named metric's slot in metricSamples; it panics if the name is not in metricIndexes.
+func (grm *GoRuntimeSampler) getIndex(name string) int {
+	i, found := grm.metricIndexes[name]
+	if !found {
+		panic(fmt.Sprintf("unsampled metric: %s", name))
+	}
+	return i
+}
+
+// float64 returns the value currently stored for the named metric as a float64.
+// N.B. It panics if the name is unknown or the value is not metrics.KindFloat64.
+func (grm *GoRuntimeSampler) float64(name string) float64 {
+	i := grm.getIndex(name)
+	return grm.metricSamples[i].Value.Float64()
+}
+
+// sampleRuntimeMetrics calls metrics.Read to refresh the values held in the
+// metricSamples field.
+// Benchmark results on 12 core Apple M3 Pro:
+// goos: darwin
+// goarch: arm64
+// pkg: github.com/cockroachdb/cockroach/pkg/server/status
+// BenchmarkGoRuntimeSampler
+// BenchmarkGoRuntimeSampler-12    	28886398	        40.03 ns/op
+//
+//	func BenchmarkGoRuntimeSampler(b *testing.B) {
+//		 s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist})
+//		 for n := 0; n < b.N; n++ {
+//			 s.sampleRuntimeMetrics()
+//		 }
+//	}
+func (grm *GoRuntimeSampler) sampleRuntimeMetrics() {
+	metrics.Read(grm.metricSamples) // fills in each sample's Value in place
+}
+
+// NewGoRuntimeSampler builds a GoRuntimeSampler that samples the given metric names.
+// It panics if any name is not among the metrics reported by metrics.All.
+func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler {
+	m := metrics.All() // descriptions of the metrics known to the runtime
+	metricTypes := make(map[string]metrics.ValueKind, len(m))
+	for _, desc := range m {
+		metricTypes[desc.Name] = desc.Kind
+	}
+	metricSamples := make([]metrics.Sample, len(metricNames))
+	metricIndexes := make(map[string]int, len(metricNames))
+	for i, n := range metricNames {
+		_, hasDesc := metricTypes[n] // only existence is checked; the kind is not validated here
+		if !hasDesc {
+			panic(fmt.Sprintf("unexpected metric: %s", n))
+		}
+		metricSamples[i] = metrics.Sample{Name: n} // Value stays unset until sampleRuntimeMetrics runs
+		metricIndexes[n] = i
+	}
+
+	grm := &GoRuntimeSampler{
+		metricSamples: metricSamples,
+		metricIndexes: metricIndexes,
+	}
+	return grm
+}
+
 // RuntimeStatSampler is used to periodically sample the runtime environment
 // for useful statistics, performing some rudimentary calculations and storing
 // the resulting information in a format that can be easily consumed by status
@@ -326,6 +413,8 @@ type RuntimeStatSampler struct {
 	// Only show "not implemented" errors once, we don't need the log spam.
 	fdUsageNotImplemented bool
 
+	goRuntimeSampler *GoRuntimeSampler
+
 	// Metric gauges maintained by the sampler.
 	// Go runtime stats.
 	CgoCalls                 *metric.Gauge
@@ -338,6 +427,7 @@ type RuntimeStatSampler struct {
 	GcCount                  *metric.Gauge
 	GcPauseNS                *metric.Gauge
 	GcPausePercent           *metric.GaugeFloat64
+	GcAssistNS               *metric.Gauge
 	// CPU stats for the CRDB process usage.
 	CPUUserNS              *metric.Gauge
 	CPUUserPercent         *metric.GaugeFloat64
@@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
 		startTimeNanos:           clock.Now().UnixNano(),
 		initialNetCounters:       netCounters,
 		initialDiskCounters:      diskCounters,
+		goRuntimeSampler:         NewGoRuntimeSampler(runtimeMetrics),
 		CgoCalls:                 metric.NewGauge(metaCgoCalls),
 		Goroutines:               metric.NewGauge(metaGoroutines),
 		RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU),
@@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
 		GcCount:                  metric.NewGauge(metaGCCount),
 		GcPauseNS:                metric.NewGauge(metaGCPauseNS),
 		GcPausePercent:           metric.NewGaugeFloat64(metaGCPausePercent),
+		GcAssistNS:               metric.NewGauge(metaGCAssistNS),
 
 		CPUUserNS:              metric.NewGauge(metaCPUUserNS),
 		CPUUserPercent:         metric.NewGaugeFloat64(metaCPUUserPercent),
@@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
 	gc := &debug.GCStats{}
 	debug.ReadGCStats(gc)
 
+	rsr.goRuntimeSampler.sampleRuntimeMetrics()
+
 	numCgoCall := runtime.NumCgoCall()
 	numGoroutine := runtime.NumGoroutine()
 
@@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
 	combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs)
 	gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur
 	runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines()
+	gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist)
+	gcAssistNS := int64(gcAssistSeconds * 1e9)
 	// The number of runnable goroutines per CPU is a count, but it can vary
 	// quickly. We don't just want to get a current snapshot of it, we want the
 	// average value since the last sampling.
@@ -651,6 +747,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
 		GCRunCount:        uint64(gc.NumGC),
 		NetHostRecvBytes:  deltaNet.BytesRecv,
 		NetHostSendBytes:  deltaNet.BytesSent,
+		GCAssistNs:        uint64(gcAssistNS),
 	}
 
 	logStats(ctx, stats)
@@ -668,6 +765,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
 	rsr.GcCount.Update(gc.NumGC)
 	rsr.GcPauseNS.Update(int64(gc.PauseTotal))
 	rsr.GcPausePercent.Update(gcPauseRatio)
+	rsr.GcAssistNS.Update(gcAssistNS)
 
 	rsr.CPUUserNS.Update(procUtime)
 	rsr.CPUUserPercent.Update(procUrate)

diff --git a/pkg/util/log/eventpb/health_events.proto b/pkg/util/log/eventpb/health_events.proto
@@ -64,4 +64,7 @@ message RuntimeStats {
   uint64 net_host_recv_bytes = 18 [(gogoproto.jsontag) = ",omitempty"];
   // The bytes sent on all network interfaces since this process started.
   uint64 net_host_send_bytes = 19 [(gogoproto.jsontag) = ",omitempty"];
+  // Estimated total CPU time user goroutines spent performing GC tasks to
+  // assist the GC. Expressed in nanoseconds.
+  uint64 gc_assist_ns = 20 [(gogoproto.customname) = "GCAssistNs", (gogoproto.jsontag) = ",omitempty"];
 }
diff --git a/pkg/util/log/eventpb/json_encode_generated.go b/pkg/util/log/eventpb/json_encode_generated.go