diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
index b4487a955b6d..53418bca74b8 100644
--- a/docs/generated/metrics/metrics.html
+++ b/docs/generated/metrics/metrics.html
@@ -1558,6 +1558,7 @@
SERVER | sys.cpu.user.percent | Current user cpu percentage consumed by the CRDB process | CPU Time | GAUGE | PERCENT | AVG | NONE |
SERVER | sys.fd.open | Process open file descriptors | File Descriptors | GAUGE | COUNT | AVG | NONE |
SERVER | sys.fd.softlimit | Process open FD soft limit | File Descriptors | GAUGE | COUNT | AVG | NONE |
+SERVER | sys.gc.assist.ns | Estimated total CPU time user goroutines spent to assist the GC process | CPU Time | GAUGE | NANOSECONDS | AVG | NONE |
SERVER | sys.gc.count | Total number of GC runs | GC Runs | GAUGE | COUNT | AVG | NONE |
SERVER | sys.gc.pause.ns | Total GC pause | GC Pause | GAUGE | NANOSECONDS | AVG | NONE |
SERVER | sys.gc.pause.percent | Current GC pause percentage | GC Pause | GAUGE | PERCENT | AVG | NONE |
diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go
index 500b561c9432..fb0612459fd8 100644
--- a/pkg/server/status/runtime.go
+++ b/pkg/server/status/runtime.go
@@ -12,10 +12,12 @@ package status
import (
"context"
+ "fmt"
"os"
"regexp"
"runtime"
"runtime/debug"
+ "runtime/metrics"
"time"
"github.com/cockroachdb/cockroach/pkg/build"
@@ -93,6 +95,12 @@ var (
Measurement: "GC Pause",
Unit: metric.Unit_PERCENT,
}
+ metaGCAssistNS = metric.Metadata{
+ Name: "sys.gc.assist.ns",
+ Help: "Estimated total CPU time user goroutines spent to assist the GC process",
+ Measurement: "CPU Time",
+ Unit: metric.Unit_NANOSECONDS,
+ }
metaCPUUserNS = metric.Metadata{
Name: "sys.cpu.user.ns",
@@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI
// error : any issues fetching stats. This should be a warning only.
var getCgoMemStats func(context.Context) (uint, uint, error)
+// Estimated total CPU time goroutines spent performing GC tasks to assist the
+// GC and prevent it from falling behind the application. This metric is an
+// overestimate, and not directly comparable to system CPU time measurements.
+// Compare only with other /cpu/classes metrics.
+const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds"
+
+var runtimeMetrics = []string{runtimeMetricGCAssist}
+
+// GoRuntimeSampler are a collection of metrics to sample from golang's runtime environment and
+// runtime metrics metadata. It fetches go runtime metrics and provides read access.
+// https://pkg.go.dev/runtime/metrics
+type GoRuntimeSampler struct {
+ // The collection of metrics we want to sample.
+ metricSamples []metrics.Sample
+ // The mapping to find metric slot in metricSamples by name.
+ metricIndexes map[string]int
+}
+
+// getIndex finds the position of metrics in the sample array by name.
+func (grm *GoRuntimeSampler) getIndex(name string) int {
+ i, found := grm.metricIndexes[name]
+ if !found {
+ panic(fmt.Sprintf("unsampled metric: %s", name))
+ }
+ return i
+}
+
+// float64 gets the sampled value by metrics name as float64.
+// N.B. This method will panic if the metrics value is not metrics.KindFloat64.
+func (grm *GoRuntimeSampler) float64(name string) float64 {
+ i := grm.getIndex(name)
+ return grm.metricSamples[i].Value.Float64()
+}
+
+// sampleRuntimeMetrics reads from metrics.Read api and fill in the value
+// in the metricSamples field.
+// Benchmark results on 12 core Apple M3 Pro:
+// goos: darwin
+// goarch: arm64
+// pkg: github.com/cockroachdb/cockroach/pkg/server/status
+// BenchmarkGoRuntimeSampler
+// BenchmarkGoRuntimeSampler-12 28886398 40.03 ns/op
+//
+// func BenchmarkGoRuntimeSampler(b *testing.B) {
+// s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist})
+// for n := 0; n < b.N; n++ {
+// s.sampleRuntimeMetrics()
+// }
+// }
+func (grm *GoRuntimeSampler) sampleRuntimeMetrics() {
+ metrics.Read(grm.metricSamples)
+}
+
+// NewGoRuntimeSampler constructs a new GoRuntimeSampler object.
+// This method will panic on invalid metrics names provided.
+func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler {
+ m := metrics.All()
+ metricTypes := make(map[string]metrics.ValueKind, len(m))
+ for _, desc := range m {
+ metricTypes[desc.Name] = desc.Kind
+ }
+ metricSamples := make([]metrics.Sample, len(metricNames))
+ metricIndexes := make(map[string]int, len(metricNames))
+ for i, n := range metricNames {
+ _, hasDesc := metricTypes[n]
+ if !hasDesc {
+ panic(fmt.Sprintf("unexpected metric: %s", n))
+ }
+ metricSamples[i] = metrics.Sample{Name: n}
+ metricIndexes[n] = i
+ }
+
+ grm := &GoRuntimeSampler{
+ metricSamples: metricSamples,
+ metricIndexes: metricIndexes,
+ }
+ return grm
+}
+
// RuntimeStatSampler is used to periodically sample the runtime environment
// for useful statistics, performing some rudimentary calculations and storing
// the resulting information in a format that can be easily consumed by status
@@ -326,6 +413,8 @@ type RuntimeStatSampler struct {
// Only show "not implemented" errors once, we don't need the log spam.
fdUsageNotImplemented bool
+ goRuntimeSampler *GoRuntimeSampler
+
// Metric gauges maintained by the sampler.
// Go runtime stats.
CgoCalls *metric.Gauge
@@ -338,6 +427,7 @@ type RuntimeStatSampler struct {
GcCount *metric.Gauge
GcPauseNS *metric.Gauge
GcPausePercent *metric.GaugeFloat64
+ GcAssistNS *metric.Gauge
// CPU stats for the CRDB process usage.
CPUUserNS *metric.Gauge
CPUUserPercent *metric.GaugeFloat64
@@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
startTimeNanos: clock.Now().UnixNano(),
initialNetCounters: netCounters,
initialDiskCounters: diskCounters,
+ goRuntimeSampler: NewGoRuntimeSampler(runtimeMetrics),
CgoCalls: metric.NewGauge(metaCgoCalls),
Goroutines: metric.NewGauge(metaGoroutines),
RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU),
@@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
GcCount: metric.NewGauge(metaGCCount),
GcPauseNS: metric.NewGauge(metaGCPauseNS),
GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent),
+ GcAssistNS: metric.NewGauge(metaGCAssistNS),
CPUUserNS: metric.NewGauge(metaCPUUserNS),
CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent),
@@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
gc := &debug.GCStats{}
debug.ReadGCStats(gc)
+ rsr.goRuntimeSampler.sampleRuntimeMetrics()
+
numCgoCall := runtime.NumCgoCall()
numGoroutine := runtime.NumGoroutine()
@@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs)
gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur
runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines()
+ gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist)
+ gcAssistNS := int64(gcAssistSeconds * 1e9)
// The number of runnable goroutines per CPU is a count, but it can vary
// quickly. We don't just want to get a current snapshot of it, we want the
// average value since the last sampling.
@@ -668,6 +764,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
rsr.GcCount.Update(gc.NumGC)
rsr.GcPauseNS.Update(int64(gc.PauseTotal))
rsr.GcPausePercent.Update(gcPauseRatio)
+ rsr.GcAssistNS.Update(gcAssistNS)
rsr.CPUUserNS.Update(procUtime)
rsr.CPUUserPercent.Update(procUrate)