Skip to content

Commit

Permalink
roachtest/cdc: export stats for initial scan test to roachperf
Browse files Browse the repository at this point in the history
This change updates the cdc/initial_scan_only test to produce a
`stats.json` artifact to be consumed by roachperf. This file
contains stats for p99 foreground latency, changefeed throughput,
and CPU usage.

Release note: None
  • Loading branch information
jayshrivastava committed Nov 30, 2022
1 parent 596c25e commit ee0fa07
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 2 deletions.
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/clusterstats/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ type AggQuery struct {
Query string
AggFn AggregateFn
Interval Interval
Tag string
}

// StatExporter defines an interface to export statistics to roachperf.
Expand Down Expand Up @@ -278,6 +279,9 @@ func (cs *clusterStatCollector) getStatSummary(
}

ret.AggTag = summaryQuery.Query
if summaryQuery.Tag != "" {
ret.AggTag = summaryQuery.Tag
}
// If there is more than one label name associated with the summary, we
// cannot be sure which is the correct label.
if len(taggedSummarySeries) != 1 {
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ go_library(
"canary.go",
"cancel.go",
"cdc.go",
"cdc_stats.go",
"chaos.go",
"clearrange.go",
"cli.go",
Expand Down
46 changes: 44 additions & 2 deletions pkg/cmd/roachtest/tests/cdc.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdctest"
"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
Expand Down Expand Up @@ -77,6 +78,7 @@ type cdcTester struct {
crdbNodes option.NodeListOption
workloadNode option.NodeListOption
logger *logger.Logger
promCfg *prometheus.Config

// sinkType -> sinkURI
sinkCache map[sinkType]string
Expand All @@ -85,6 +87,43 @@ type cdcTester struct {
doneCh chan struct{}
}

// startStatsCollection sets the start point of the stats collection window
// and returns a function which should be called at the end of the test to dump a
// stats.json file to the artifacts directory.
//
// If the prometheus configuration is missing, or the prometheus client for
// the stats collector cannot be created, the failure is reported through ct.t
// and a no-op function is returned. Callers can therefore always invoke the
// returned function unconditionally.
func (ct *cdcTester) startStatsCollection() func() {
	if ct.promCfg == nil {
		ct.t.Error("prometheus configuration is nil")
		// Do not fall through: the code below would hand the nil config to
		// SetupCollectorPromClient. This return is harmless if Error already
		// stops the test and prevents misuse of the nil config if it does not.
		return func() {}
	}
	promClient, err := clusterstats.SetupCollectorPromClient(ct.ctx, ct.cluster, ct.t.L(), ct.promCfg)
	if err != nil {
		ct.t.Errorf("error creating prometheus client for stats collector: %s", err)
		// The client must not be used after a failed setup, so there is
		// nothing to export at the end of the test.
		return func() {}
	}

	statsCollector := clusterstats.NewStatsCollector(ct.ctx, promClient)
	// The collection window opens now and closes when the returned function
	// is invoked.
	startTime := timeutil.Now()
	return func() {
		endTime := timeutil.Now()
		if err := statsCollector.Exporter().Export(ct.ctx, ct.cluster, ct.t,
			startTime,
			endTime,
			[]clusterstats.AggQuery{sqlServiceLatencyAgg, changefeedThroughputAgg, cpuUsageAgg},
			// The callback ignores the collected summaries and reports the
			// wall-clock length of the collection window instead.
			func(_ map[string]clusterstats.StatSummary) (string, float64) {
				// TODO(jayant): update this metric to be more accurate.
				// It may be worth plugging in real latency values from the latency
				// verifier here in the future for more accuracy. However, it may not be
				// worth the added complexity. Since latency verifier failures will show
				// up as roachtest failures, we don't need to make them very apparent in
				// roachperf. Note that other roachperf stats, such as the aggregate stats
				// above, will be accurate.
				return "Total Run Time (mins)", endTime.Sub(startTime).Minutes()
			},
		); err != nil {
			ct.t.Errorf("error exporting stats file: %s", err)
		}
	}
}

func (ct *cdcTester) startCRDBChaos() {
chaosStopper := make(chan time.Time)
ct.mon.Go(func(ctx context.Context) error {
Expand Down Expand Up @@ -468,18 +507,19 @@ func newCDCTester(ctx context.Context, t test.Test, c cluster.Cluster) cdcTester
if !t.SkipInit() {
tester.startGrafana()
}

return tester
}

func (ct *cdcTester) startGrafana() {
// Setup the prometheus instance on the workload node
cfg := (&prometheus.Config{}).
WithPrometheusNode(ct.workloadNode.InstallNodes()[0]).
WithCluster(ct.crdbNodes.InstallNodes()).
WithNodeExporter(ct.crdbNodes.InstallNodes()).
WithGrafanaDashboard("https://go.crdb.dev/p/changefeed-roachtest-grafana-dashboard")
cfg.Grafana.Enabled = true

ct.promCfg = cfg

err := ct.cluster.StartGrafana(ct.ctx, ct.t.L(), cfg)
if err != nil {
ct.t.Errorf("error starting prometheus/grafana: %s", err)
Expand Down Expand Up @@ -846,6 +886,7 @@ func registerCDC(r registry.Registry) {

ct.runTPCCWorkload(tpccArgs{warehouses: 100})

exportStatsFile := ct.startStatsCollection()
feed := ct.newChangefeed(feedArgs{
sinkType: kafkaSink,
targets: allTpccTargets,
Expand All @@ -855,6 +896,7 @@ func registerCDC(r registry.Registry) {
initialScanLatency: 30 * time.Minute,
})
feed.waitForCompletion()
exportStatsFile()
},
})
r.Add(registry.TestSpec{
Expand Down
42 changes: 42 additions & 0 deletions pkg/cmd/roachtest/tests/cdc_stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tests

import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats"

var (
	// sqlServiceLatency refers to the sql_service_latency_bucket prometheus
	// metric, labeled by node.
	sqlServiceLatency = clusterstats.ClusterStat{
		Query:     "sql_service_latency_bucket",
		LabelName: "node",
	}
	// sqlServiceLatencyAgg aggregates sqlServiceLatency into the P99 latency
	// of foreground SQL traffic across all nodes, reported in ms.
	sqlServiceLatencyAgg = clusterstats.AggQuery{
		Tag:   "P99 Foreground Latency (ms)",
		Query: "histogram_quantile(0.99, sum by(le) (rate(sql_service_latency_bucket[2m]))) / (1000*1000)",
		Stat:  sqlServiceLatency,
	}

	// changefeedThroughput refers to the changefeed_emitted_bytes prometheus
	// metric, labeled by node.
	changefeedThroughput = clusterstats.ClusterStat{
		Query:     "changefeed_emitted_bytes",
		LabelName: "node",
	}
	// changefeedThroughputAgg aggregates changefeedThroughput into the total
	// rate of bytes emitted by the cluster, reported in MB/s.
	changefeedThroughputAgg = clusterstats.AggQuery{
		Tag:   "Throughput (MBps)",
		Query: "sum(rate(changefeed_emitted_bytes[1m]) / (1000 * 1000))",
		Stat:  changefeedThroughput,
	}

	// cpuUsage refers to the sys_cpu_combined_percent_normalized prometheus
	// metric, labeled by node.
	cpuUsage = clusterstats.ClusterStat{
		Query:     "sys_cpu_combined_percent_normalized",
		LabelName: "node",
	}
	// cpuUsageAgg aggregates cpuUsage into the average CPU usage across all
	// nodes, scaled by 100 for reporting as a percentage.
	cpuUsageAgg = clusterstats.AggQuery{
		Tag:   "CPU Utilization (%)",
		Query: "avg(sys_cpu_combined_percent_normalized) * 100",
		Stat:  cpuUsage,
	}
)

0 comments on commit ee0fa07

Please sign in to comment.