Skip to content

Commit

Permalink
obs: export metrics about Go GC Assist work
Browse files Browse the repository at this point in the history
This commit introduced functions to extract exposed metrics in go runtime
metrics api. The runtime metrics is sampled along in SampleEnvironment call
every 10 seconds. New metric GcAssistNS is captured in this pr to capture
represent an estimate to amount of effort of user go routines assist in gc
activities in nanoseconds.

Fixes: #88178

Release note: None
  • Loading branch information
lyang24 committed Feb 16, 2024
1 parent 47a68b5 commit 3f1e528
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/generated/eventlog.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ An event of type `runtime_stats` is recorded every 10 seconds as server health m
| `GCRunCount` | The total number of GC runs. | no |
| `NetHostRecvBytes` | The bytes received on all network interfaces since this process started. | no |
| `NetHostSendBytes` | The bytes sent on all network interfaces since this process started. | no |
| `GCAssistNs` | Estimated total CPU time user goroutines spent performing GC tasks to assist the GC. Expressed in nanoseconds. | no |


#### Common fields
Expand Down
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,7 @@
<tr><td>SERVER</td><td>sys.cpu.user.percent</td><td>Current user cpu percentage consumed by the CRDB process</td><td>CPU Time</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.fd.open</td><td>Process open file descriptors</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.fd.softlimit</td><td>Process open FD soft limit</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.assist.ns</td><td>Estimated total CPU time user goroutines spent to assist the GC process</td><td>CPU Time</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.count</td><td>Total number of GC runs</td><td>GC Runs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.pause.ns</td><td>Total GC pause</td><td>GC Pause</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.pause.percent</td><td>Current GC pause percentage</td><td>GC Pause</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>
Expand Down
98 changes: 98 additions & 0 deletions pkg/server/status/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ package status

import (
"context"
"fmt"
"os"
"regexp"
"runtime"
"runtime/debug"
"runtime/metrics"
"time"

"github.com/cockroachdb/cockroach/pkg/build"
Expand Down Expand Up @@ -93,6 +95,12 @@ var (
Measurement: "GC Pause",
Unit: metric.Unit_PERCENT,
}
metaGCAssistNS = metric.Metadata{
Name: "sys.gc.assist.ns",
Help: "Estimated total CPU time user goroutines spent to assist the GC process",
Measurement: "CPU Time",
Unit: metric.Unit_NANOSECONDS,
}

metaCPUUserNS = metric.Metadata{
Name: "sys.cpu.user.ns",
Expand Down Expand Up @@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI
// error : any issues fetching stats. This should be a warning only.
var getCgoMemStats func(context.Context) (uint, uint, error)

// Estimated total CPU time goroutines spent performing GC tasks to assist the
// GC and prevent it from falling behind the application. This metric is an
// overestimate, and not directly comparable to system CPU time measurements.
// Compare only with other /cpu/classes metrics.
const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds"

var runtimeMetrics = []string{runtimeMetricGCAssist}

// GoRuntimeSampler are a collection of metrics to sample from golang's runtime environment and
// runtime metrics metadata. It fetches go runtime metrics and provides read access.
// https://pkg.go.dev/runtime/metrics
type GoRuntimeSampler struct {
// The collection of metrics we want to sample.
metricSamples []metrics.Sample
// The mapping to find metric slot in metricSamples by name.
metricIndexes map[string]int
}

// getIndex finds the position of metrics in the sample array by name.
func (grm *GoRuntimeSampler) getIndex(name string) int {
i, found := grm.metricIndexes[name]
if !found {
panic(fmt.Sprintf("unsampled metric: %s", name))
}
return i
}

// float64 gets the sampled value by metrics name as float64.
// N.B. This method will panic if the metrics value is not metrics.KindFloat64.
func (grm *GoRuntimeSampler) float64(name string) float64 {
i := grm.getIndex(name)
return grm.metricSamples[i].Value.Float64()
}

// sampleRuntimeMetrics reads from metrics.Read api and fill in the value
// in the metricSamples field.
// Benchmark results on 12 core Apple M3 Pro:
// goos: darwin
// goarch: arm64
// pkg: github.com/cockroachdb/cockroach/pkg/server/status
// BenchmarkGoRuntimeSampler
// BenchmarkGoRuntimeSampler-12 28886398 40.03 ns/op
//
// func BenchmarkGoRuntimeSampler(b *testing.B) {
// s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist})
// for n := 0; n < b.N; n++ {
// s.sampleRuntimeMetrics()
// }
// }
func (grm *GoRuntimeSampler) sampleRuntimeMetrics() {
metrics.Read(grm.metricSamples)
}

// NewGoRuntimeSampler constructs a new GoRuntimeSampler object.
// This method will panic on invalid metrics names provided.
func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler {
m := metrics.All()
metricTypes := make(map[string]metrics.ValueKind, len(m))
for _, desc := range m {
metricTypes[desc.Name] = desc.Kind
}
metricSamples := make([]metrics.Sample, len(metricNames))
metricIndexes := make(map[string]int, len(metricNames))
for i, n := range metricNames {
_, hasDesc := metricTypes[n]
if !hasDesc {
panic(fmt.Sprintf("unexpected metric: %s", n))
}
metricSamples[i] = metrics.Sample{Name: n}
metricIndexes[n] = i
}

grm := &GoRuntimeSampler{
metricSamples: metricSamples,
metricIndexes: metricIndexes,
}
return grm
}

// RuntimeStatSampler is used to periodically sample the runtime environment
// for useful statistics, performing some rudimentary calculations and storing
// the resulting information in a format that can be easily consumed by status
Expand Down Expand Up @@ -326,6 +413,8 @@ type RuntimeStatSampler struct {
// Only show "not implemented" errors once, we don't need the log spam.
fdUsageNotImplemented bool

goRuntimeSampler *GoRuntimeSampler

// Metric gauges maintained by the sampler.
// Go runtime stats.
CgoCalls *metric.Gauge
Expand All @@ -338,6 +427,7 @@ type RuntimeStatSampler struct {
GcCount *metric.Gauge
GcPauseNS *metric.Gauge
GcPausePercent *metric.GaugeFloat64
GcAssistNS *metric.Gauge
// CPU stats for the CRDB process usage.
CPUUserNS *metric.Gauge
CPUUserPercent *metric.GaugeFloat64
Expand Down Expand Up @@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
startTimeNanos: clock.Now().UnixNano(),
initialNetCounters: netCounters,
initialDiskCounters: diskCounters,
goRuntimeSampler: NewGoRuntimeSampler(runtimeMetrics),
CgoCalls: metric.NewGauge(metaCgoCalls),
Goroutines: metric.NewGauge(metaGoroutines),
RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU),
Expand All @@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
GcCount: metric.NewGauge(metaGCCount),
GcPauseNS: metric.NewGauge(metaGCPauseNS),
GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent),
GcAssistNS: metric.NewGauge(metaGCAssistNS),

CPUUserNS: metric.NewGauge(metaCPUUserNS),
CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent),
Expand Down Expand Up @@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
gc := &debug.GCStats{}
debug.ReadGCStats(gc)

rsr.goRuntimeSampler.sampleRuntimeMetrics()

numCgoCall := runtime.NumCgoCall()
numGoroutine := runtime.NumGoroutine()

Expand Down Expand Up @@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs)
gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur
runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines()
gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist)
gcAssistNS := int64(gcAssistSeconds * 1e9)
// The number of runnable goroutines per CPU is a count, but it can vary
// quickly. We don't just want to get a current snapshot of it, we want the
// average value since the last sampling.
Expand Down Expand Up @@ -651,6 +747,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
GCRunCount: uint64(gc.NumGC),
NetHostRecvBytes: deltaNet.BytesRecv,
NetHostSendBytes: deltaNet.BytesSent,
GCAssistNs: uint64(gcAssistNS),
}

logStats(ctx, stats)
Expand All @@ -668,6 +765,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
rsr.GcCount.Update(gc.NumGC)
rsr.GcPauseNS.Update(int64(gc.PauseTotal))
rsr.GcPausePercent.Update(gcPauseRatio)
rsr.GcAssistNS.Update(gcAssistNS)

rsr.CPUUserNS.Update(procUtime)
rsr.CPUUserPercent.Update(procUrate)
Expand Down
3 changes: 3 additions & 0 deletions pkg/util/log/eventpb/health_events.proto
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,7 @@ message RuntimeStats {
uint64 net_host_recv_bytes = 18 [(gogoproto.jsontag) = ",omitempty"];
// The bytes sent on all network interfaces since this process started.
uint64 net_host_send_bytes = 19 [(gogoproto.jsontag) = ",omitempty"];
// Estimated total CPU time user goroutines spent performing GC tasks to
// assist the GC. Expressed in nanoseconds.
uint64 gc_assist_ns = 20 [(gogoproto.customname) = "GCAssistNs", (gogoproto.jsontag) = ",omitempty"];
}
9 changes: 9 additions & 0 deletions pkg/util/log/eventpb/json_encode_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3f1e528

Please sign in to comment.