Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ts: include histogram quantiles in tsdump #69469

Merged
merged 1 commit into from
Aug 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ go_test(
"@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library",
"@com_github_kr_pretty//:pretty",
"@com_github_lib_pq//:pq",
"@com_github_prometheus_client_model//go",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_grpc//:go_default_library",
Expand Down
40 changes: 39 additions & 1 deletion pkg/server/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/kr/pretty"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -929,6 +930,17 @@ func TestChartCatalogGen(t *testing.T) {
}
}

// walkAllSections invokes the visitor on each of the ChartSections nestled under
// the input one.
func walkAllSections(chartCatalog []catalog.ChartSection, visit func(c *catalog.ChartSection)) {
for _, c := range chartCatalog {
visit(&c)
for _, ic := range c.Subsections {
visit(ic)
}
}
}

// findUndefinedMetrics finds metrics listed in pkg/ts/catalog/chart_catalog.go
// that are not defined. This is most likely caused by a metric being removed.
func findUndefinedMetrics(c *catalog.ChartSection, metadata map[string]metric.Metadata) []string {
Expand Down Expand Up @@ -1015,9 +1027,35 @@ func TestChartCatalogMetrics(t *testing.T) {
metricNames = append(metricNames, metricName)
}
sort.Strings(metricNames)
t.Fatalf(`The following metrics need to be added to the chart catalog
t.Errorf(`The following metrics need to be added to the chart catalog
(pkg/ts/catalog/chart_catalog.go): %v`, metricNames)
}

internalTSDBMetricNamesWithoutPrefix := map[string]struct{}{}
for _, name := range catalog.AllInternalTimeseriesMetricNames() {
name = strings.TrimPrefix(name, "cr.node.")
name = strings.TrimPrefix(name, "cr.store.")
internalTSDBMetricNamesWithoutPrefix[name] = struct{}{}
}
walkAllSections(chartCatalog, func(cs *catalog.ChartSection) {
for _, chart := range cs.Charts {
for _, metric := range chart.Metrics {
if *metric.MetricType.Enum() != io_prometheus_client.MetricType_HISTOGRAM {
continue
}
// We have a histogram. Make sure that it is properly represented in
// AllInternalTimeseriesMetricNames(). It's not a complete check but good enough in
// practice. Ideally we wouldn't require `histogramMetricsNames` and
// the associated manual step when adding a histogram. See:
// https://github.com/cockroachdb/cockroach/issues/64373
_, ok := internalTSDBMetricNamesWithoutPrefix[metric.Name+"-p50"]
if !ok {
t.Errorf("histogram %s needs to be added to `catalog.histogramMetricsNames` manually",
metric.Name)
}
}
}
})
}

func TestHotRangesResponse(t *testing.T) {
Expand Down
104 changes: 93 additions & 11 deletions pkg/ts/catalog/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,110 @@ package catalog

import "sort"

var metricsNames []string
var internalTSMetricsNames []string

func init() {
metricsNames = allMetricsNames()
internalTSMetricsNames = allInternalTSMetricsNames()
}

// AllMetricsNames returns all of the possible metric names used by the
// sections.
// AllInternalTimeseriesMetricNames returns a slice that returns all of the
// metric names used by CockroachDB's internal timeseries database. This is
// *not* the list of names as it appears on prometheus. In particular, since
// the internal TS DB does not support histograms, it instead records timeseries
// for a number of quantiles.
//
// Note that it will return some metric names that don't exist since the
// sections do not indicate whether metrics are per-node or per-store (so both
// are returned).
func AllMetricsNames() []string {
return metricsNames
// For technical reasons, the returned set will contain metrics names that do
// not exist. This is because a running server is required to determine the
// proper prefix for each metric (per-node or per-store); we don't have one
// here, so we return both possible prefixes.
func AllInternalTimeseriesMetricNames() []string {
return internalTSMetricsNames
}

func allMetricsNames() []string {
// The histogramMetricsNames variable below was originally seeded using the invocation
// below. It's kept up to date via `server.TestChartCatalogMetrics`.
var _ = `
./cockroach demo --empty -e \
"select name from crdb_internal.node_metrics where name like '%-p50'" | \
sed -E 's/^(.*)-p50$/"\1": {},/'
`

var histogramMetricsNames = map[string]struct{}{
"sql.txn.latency.internal": {},
"sql.conn.latency": {},
"sql.mem.sql.session.max": {},
"sql.stats.flush.duration": {},
"changefeed.checkpoint_hist_nanos": {},
"admission.wait_durations.sql-sql-response": {},
"admission.wait_durations.sql-kv-response": {},
"sql.exec.latency": {},
"sql.stats.mem.max.internal": {},
"admission.wait_durations.sql-leaf-start": {},
"sql.disk.distsql.max": {},
"txn.restarts": {},
"sql.stats.flush.duration.internal": {},
"sql.distsql.exec.latency": {},
"sql.mem.internal.txn.max": {},
"changefeed.emit_hist_nanos": {},
"changefeed.flush_hist_nanos": {},
"sql.service.latency": {},
"round-trip-latency": {},
"admission.wait_durations.kv": {},
"sql.mem.distsql.max": {},
"kv.prober.write.latency": {},
"exec.latency": {},
"admission.wait_durations.sql-root-start": {},
"sql.mem.bulk.max": {},
"sql.distsql.flows.queue_wait": {},
"sql.txn.latency": {},
"sql.mem.root.max": {},
"admission.wait_durations.kv-stores": {},
"sql.stats.mem.max": {},
"sql.distsql.service.latency.internal": {},
"sql.stats.reported.mem.max.internal": {},
"sql.stats.reported.mem.max": {},
"sql.exec.latency.internal": {},
"sql.mem.internal.session.max": {},
"sql.distsql.exec.latency.internal": {},
"kv.prober.read.latency": {},
"sql.distsql.service.latency": {},
"sql.service.latency.internal": {},
"sql.mem.sql.txn.max": {},
"liveness.heartbeatlatency": {},
"txn.durations": {},
"raft.process.handleready.latency": {},
"raft.process.commandcommit.latency": {},
"raft.process.logcommit.latency": {},
"raft.scheduler.latency": {},
"txnwaitqueue.pusher.wait_time": {},
"txnwaitqueue.query.wait_time": {},
"raft.process.applycommitted.latency": {},
}

func allInternalTSMetricsNames() []string {
m := map[string]struct{}{}
for _, section := range charts {
for _, chart := range section.Charts {
for _, metric := range chart.Metrics {
m[metric] = struct{}{}
// Jump through hoops to create the correct internal timeseries metrics names.
// See:
// https://github.com/cockroachdb/cockroach/issues/64373
_, isHist := histogramMetricsNames[metric]
if !isHist {
m[metric] = struct{}{}
continue
}
for _, p := range []string{
"p50",
"p75",
"p90",
"p99",
"p99.9",
"p99.99",
"p99.999",
} {
m[metric+"-"+p] = struct{}{}
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/ts/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ func dumpImpl(
) error {
names := req.Names
if len(names) == 0 {
names = catalog.AllMetricsNames()
names = catalog.AllInternalTimeseriesMetricNames()
}
resolutions := req.Resolutions
if len(resolutions) == 0 {
Expand Down