Skip to content

Commit

Permalink
roachtest: check metrics during multi-region TPC-C runs
Browse files Browse the repository at this point in the history
This commit adds checks of Prometheus metrics during a TPC-C run.
We assert that the shapes of the metric curves look the way we expect,
and fail the roachtest if they do not match.

Release note: None
  • Loading branch information
otan committed Jul 6, 2021
1 parent 0e88c09 commit 310b9fb
Show file tree
Hide file tree
Showing 11 changed files with 880 additions and 3 deletions.
1 change: 1 addition & 0 deletions build/bazelutil/check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pkg/kv/kvclient/rangecache/range_cache.go://go:generate mockgen -package=rangeca
pkg/kv/kvclient/rangefeed/rangefeed.go://go:generate mockgen -package=rangefeed -source rangefeed.go -destination=mocks_generated.go .
pkg/kv/kvclient/kvcoord/transport.go://go:generate mockgen -package=kvcoord -destination=mocks_generated.go . Transport
pkg/roachpb/api.go://go:generate mockgen -package=roachpb -destination=mocks_generated.go . InternalClient,Internal_RangeFeedClient
pkg/cmd/roachtest/drt.go://go:generate mockgen -source drt.go -package main -destination drt_generated.go
pkg/security/securitytest/securitytest.go://go:generate go-bindata -mode 0600 -modtime 1400000000 -pkg securitytest -o embedded.go -ignore README.md -ignore regenerate.sh test_certs
pkg/security/securitytest/securitytest.go://go:generate gofmt -s -w embedded.go
pkg/security/securitytest/securitytest.go://go:generate goimports -w embedded.go
Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,7 @@ github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCV
github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/json-iterator/go v1.1.8/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68=
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfEUpgAwUN0o=
Expand Down Expand Up @@ -1043,9 +1044,11 @@ github.com/moby/moby v20.10.6+incompatible/go.mod h1:fDXVQ6+S340veQPv35CzDahGBmH
github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk=
github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
github.com/montanaflynn/stats v0.6.3 h1:F8446DrvIF5V5smZfZ8K9nrmmix0AFgevPdLruGOmzk=
Expand Down
10 changes: 10 additions & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ go_library(
"django.go",
"django_blocklist.go",
"drop.go",
"drt.go",
"drt_generated.go",
"encryption.go",
"engine_switch.go",
"event_log.go",
Expand Down Expand Up @@ -187,10 +189,14 @@ go_library(
"@com_github_cockroachdb_ttycolor//:ttycolor",
"@com_github_codahale_hdrhistogram//:hdrhistogram",
"@com_github_dustin_go_humanize//:go-humanize",
"@com_github_golang_mock//gomock",
"@com_github_kr_pretty//:pretty",
"@com_github_lib_pq//:pq",
"@com_github_nlopes_slack//:slack",
"@com_github_petermattis_goid//:goid",
"@com_github_prometheus_client_golang//api",
"@com_github_prometheus_client_golang//api/prometheus/v1:prometheus",
"@com_github_prometheus_common//model",
"@com_github_shopify_sarama//:sarama",
"@com_github_shopify_toxiproxy//client",
"@com_github_spf13_cobra//:cobra",
Expand All @@ -211,6 +217,7 @@ go_test(
size = "small",
srcs = [
"blocklist_test.go",
"drt_test.go",
"tpcc_test.go",
"z_cluster_test.go",
"z_test_registry_test.go",
Expand All @@ -222,6 +229,7 @@ go_test(
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/logger",
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/prometheus",
"//pkg/cmd/roachtest/registry",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
Expand All @@ -231,8 +239,10 @@ go_test(
"//pkg/util/timeutil",
"//pkg/util/version",
"@com_github_cockroachdb_errors//:errors",
"@com_github_golang_mock//gomock",
"@com_github_google_go_github//github",
"@com_github_kr_pretty//:pretty",
"@com_github_prometheus_common//model",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_x_oauth2//:oauth2",
Expand Down
50 changes: 50 additions & 0 deletions pkg/cmd/roachtest/chaos.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,47 @@ type Chaos struct {
// DrainAndQuit is used to determine if want to kill the node vs draining it
// first and shutting down gracefully.
DrainAndQuit bool
// ChaosEventCh is a channel that the chaos runner will send events on when
// the runner performs an action.
// Chaos is responsible for closing the channel when the test is over.
// This is optional.
ChaosEventCh chan ChaosEvent
}

// ChaosEventType identifies the kind of action reported by a ChaosEvent.
// The valid values are the ChaosEventType* constants.
type ChaosEventType uint64

const (
// ChaosEventTypePreShutdown signifies that a shutdown of the target(s) is
// about to happen. Runner sends it before stopping the target nodes.
ChaosEventTypePreShutdown ChaosEventType = iota
// ChaosEventTypeShutdownComplete signifies that the target(s) have been
// shut down. Runner sends it once the stop call has returned without error.
ChaosEventTypeShutdownComplete
// ChaosEventTypePreStartup signifies that the target(s) are about to be
// restarted. Runner sends it before starting the target nodes again.
ChaosEventTypePreStartup
// ChaosEventTypeStartupComplete signifies that the target(s) have been
// restarted. Runner sends it once the start call has returned without error.
ChaosEventTypeStartupComplete

// ChaosEventTypeStart signifies that the chaos runner has started. It is
// sent with a nil target.
ChaosEventTypeStart
// ChaosEventTypeEnd signifies that the chaos runner has ended. It is sent
// with a nil target, immediately before the event channel is closed.
ChaosEventTypeEnd
)

// ChaosEvent is an event which happens during chaos running. Events are
// delivered on Chaos.ChaosEventCh when that channel is set.
type ChaosEvent struct {
// Type is the kind of action being reported.
Type ChaosEventType
// Target is the set of nodes the event applies to. It is nil for the
// ChaosEventTypeStart and ChaosEventTypeEnd events sent by Runner.
Target option.NodeListOption
// Time is the timestamp taken when the event was created for sending.
Time time.Time
}

// sendEvent publishes an event of type t for the given target on
// ChaosEventCh, stamped with the current time. It does nothing when no event
// channel has been configured. The event is delivered with a plain channel
// send, so the call waits until the channel can accept it.
func (ch *Chaos) sendEvent(t ChaosEventType, target option.NodeListOption) {
	if ch.ChaosEventCh == nil {
		// Event reporting is optional; nothing to do without a channel.
		return
	}
	ev := ChaosEvent{Type: t, Target: target, Time: timeutil.Now()}
	ch.ChaosEventCh <- ev
}

// Runner returns a closure that runs chaos against the given cluster without
Expand All @@ -64,13 +105,18 @@ func (ch *Chaos) Runner(
return err
}
defer func() {
ch.sendEvent(ChaosEventTypeEnd, nil)
if ch.ChaosEventCh != nil {
close(ch.ChaosEventCh)
}
l.Printf("chaos stopping: %v", err)
}()
t := timeutil.Timer{}
{
p, _ := ch.Timer.Timing()
t.Reset(p)
}
ch.sendEvent(ChaosEventTypeStart, nil)
for {
select {
case <-ch.Stopper:
Expand All @@ -86,6 +132,7 @@ func (ch *Chaos) Runner(
target := ch.Target()
m.ExpectDeaths(int32(len(target)))

ch.sendEvent(ChaosEventTypePreShutdown, target)
if ch.DrainAndQuit {
l.Printf("stopping and draining %v\n", target)
if err := c.StopE(ctx, target, option.StopArgs("--sig=15"), withWorkerAction()); err != nil {
Expand All @@ -97,6 +144,7 @@ func (ch *Chaos) Runner(
return errors.Wrapf(err, "could not stop node %s", target)
}
}
ch.sendEvent(ChaosEventTypeShutdownComplete, target)

select {
case <-ch.Stopper:
Expand All @@ -123,9 +171,11 @@ func (ch *Chaos) Runner(
}
l.Printf("restarting %v after %s of downtime\n", target, downTime)
t.Reset(period)
ch.sendEvent(ChaosEventTypePreStartup, target)
if err := c.StartE(ctx, target, withWorkerAction()); err != nil {
return errors.Wrapf(err, "could not restart node %s", target)
}
ch.sendEvent(ChaosEventTypeStartupComplete, target)
}
}
}
Loading

0 comments on commit 310b9fb

Please sign in to comment.