diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 67675ea16206..89fc9ecc87cf 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -381,6 +381,7 @@ go_test( "@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library", "@com_github_kr_pretty//:pretty", "@com_github_lib_pq//:pq", + "@com_github_prometheus_client_model//go", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_golang_google_grpc//:go_default_library", diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go index 1cd0acf9ee44..c1c7783c973d 100644 --- a/pkg/server/status_test.go +++ b/pkg/server/status_test.go @@ -66,6 +66,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" "github.com/kr/pretty" + io_prometheus_client "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -929,6 +930,17 @@ func TestChartCatalogGen(t *testing.T) { } } +// walkAllSections invokes the visitor on each of the ChartSections nestled under +// the input one. +func walkAllSections(chartCatalog []catalog.ChartSection, visit func(c *catalog.ChartSection)) { + for _, c := range chartCatalog { + visit(&c) + for _, ic := range c.Subsections { + visit(ic) + } + } +} + // findUndefinedMetrics finds metrics listed in pkg/ts/catalog/chart_catalog.go // that are not defined. This is most likely caused by a metric being removed. func findUndefinedMetrics(c *catalog.ChartSection, metadata map[string]metric.Metadata) []string { @@ -1015,9 +1027,35 @@ func TestChartCatalogMetrics(t *testing.T) { metricNames = append(metricNames, metricName) } sort.Strings(metricNames) - t.Fatalf(`The following metrics need to be added to the chart catalog + t.Errorf(`The following metrics need to be added to the chart catalog (pkg/ts/catalog/chart_catalog.go): %v`, metricNames) } + + internalTSDBMetricNamesWithoutPrefix := map[string]struct{}{} + for _, name := range catalog.AllInternalTimeseriesMetricNames() { + name = strings.TrimPrefix(name, "cr.node.") + name = strings.TrimPrefix(name, "cr.store.") + internalTSDBMetricNamesWithoutPrefix[name] = struct{}{} + } + walkAllSections(chartCatalog, func(cs *catalog.ChartSection) { + for _, chart := range cs.Charts { + for _, metric := range chart.Metrics { + if *metric.MetricType.Enum() != io_prometheus_client.MetricType_HISTOGRAM { + continue + } + // We have a histogram. Make sure that it is properly represented in + // AllInternalTimeseriesMetricNames(). It's not a complete check but good enough in + // practice. Ideally we wouldn't require `histogramMetricsNames` and + // the associated manual step when adding a histogram. See: + // https://github.com/cockroachdb/cockroach/issues/64373 + _, ok := internalTSDBMetricNamesWithoutPrefix[metric.Name+"-p50"] + if !ok { + t.Errorf("histogram %s needs to be added to `catalog.histogramMetricsNames` manually", + metric.Name) + } + } + } + }) } func TestHotRangesResponse(t *testing.T) { diff --git a/pkg/ts/catalog/metrics.go b/pkg/ts/catalog/metrics.go index d6f28fa47c9e..03b93925cc35 100644 --- a/pkg/ts/catalog/metrics.go +++ b/pkg/ts/catalog/metrics.go @@ -12,28 +12,110 @@ package catalog import "sort" -var metricsNames []string +var internalTSMetricsNames []string func init() { - metricsNames = allMetricsNames() + internalTSMetricsNames = allInternalTSMetricsNames() } -// AllMetricsNames returns all of the possible metric names used by the -// sections. +// AllInternalTimeseriesMetricNames returns a slice that returns all of the +// metric names used by CockroachDB's internal timeseries database. This is +// *not* the list of names as it appears on prometheus. In particular, since +// the internal TS DB does not support histograms, it instead records timeseries +// for a number of quantiles. // -// Note that it will return some metric names that don't exist since the -// sections do not indicate whether metrics are per-node or per-store (so both -// are returned). -func AllMetricsNames() []string { - return metricsNames +// For technical reasons, the returned set will contain metrics names that do +// not exist. This is because a running server is required to determine the +// proper prefix for each metric (per-node or per-store); we don't have one +// here, so we return both possible prefixes. +func AllInternalTimeseriesMetricNames() []string { + return internalTSMetricsNames } -func allMetricsNames() []string { +// The histogramMetricsNames variable below was originally seeded using the invocation +// below. It's kept up to date via `server.TestChartCatalogMetrics`. +var _ = ` +./cockroach demo --empty -e \ + "select name from crdb_internal.node_metrics where name like '%-p50'" | \ + sed -E 's/^(.*)-p50$/"\1": {},/' +` + +var histogramMetricsNames = map[string]struct{}{ + "sql.txn.latency.internal": {}, + "sql.conn.latency": {}, + "sql.mem.sql.session.max": {}, + "sql.stats.flush.duration": {}, + "changefeed.checkpoint_hist_nanos": {}, + "admission.wait_durations.sql-sql-response": {}, + "admission.wait_durations.sql-kv-response": {}, + "sql.exec.latency": {}, + "sql.stats.mem.max.internal": {}, + "admission.wait_durations.sql-leaf-start": {}, + "sql.disk.distsql.max": {}, + "txn.restarts": {}, + "sql.stats.flush.duration.internal": {}, + "sql.distsql.exec.latency": {}, + "sql.mem.internal.txn.max": {}, + "changefeed.emit_hist_nanos": {}, + "changefeed.flush_hist_nanos": {}, + "sql.service.latency": {}, + "round-trip-latency": {}, + "admission.wait_durations.kv": {}, + "sql.mem.distsql.max": {}, + "kv.prober.write.latency": {}, + "exec.latency": {}, + "admission.wait_durations.sql-root-start": {}, + "sql.mem.bulk.max": {}, + "sql.distsql.flows.queue_wait": {}, + "sql.txn.latency": {}, + "sql.mem.root.max": {}, + "admission.wait_durations.kv-stores": {}, + "sql.stats.mem.max": {}, + "sql.distsql.service.latency.internal": {}, + "sql.stats.reported.mem.max.internal": {}, + "sql.stats.reported.mem.max": {}, + "sql.exec.latency.internal": {}, + "sql.mem.internal.session.max": {}, + "sql.distsql.exec.latency.internal": {}, + "kv.prober.read.latency": {}, + "sql.distsql.service.latency": {}, + "sql.service.latency.internal": {}, + "sql.mem.sql.txn.max": {}, + "liveness.heartbeatlatency": {}, + "txn.durations": {}, + "raft.process.handleready.latency": {}, + "raft.process.commandcommit.latency": {}, + "raft.process.logcommit.latency": {}, + "raft.scheduler.latency": {}, + "txnwaitqueue.pusher.wait_time": {}, + "txnwaitqueue.query.wait_time": {}, + "raft.process.applycommitted.latency": {}, +} + +func allInternalTSMetricsNames() []string { m := map[string]struct{}{} for _, section := range charts { for _, chart := range section.Charts { for _, metric := range chart.Metrics { - m[metric] = struct{}{} + // Jump through hoops to create the correct internal timeseries metrics names. + // See: + // https://github.com/cockroachdb/cockroach/issues/64373 + _, isHist := histogramMetricsNames[metric] + if !isHist { + m[metric] = struct{}{} + continue + } + for _, p := range []string{ + "p50", + "p75", + "p90", + "p99", + "p99.9", + "p99.99", + "p99.999", + } { + m[metric+"-"+p] = struct{}{} + } } } } diff --git a/pkg/ts/server.go b/pkg/ts/server.go index 2d3d7d01b933..74763e4090d7 100644 --- a/pkg/ts/server.go +++ b/pkg/ts/server.go @@ -337,7 +337,7 @@ func dumpImpl( ) error { names := req.Names if len(names) == 0 { - names = catalog.AllMetricsNames() + names = catalog.AllInternalTimeseriesMetricNames() } resolutions := req.Resolutions if len(resolutions) == 0 {