admission_control_tpcc_overload.go
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tests

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// tpccOlapQuery is a contrived query that seems to do serious damage to a
// cluster. The query itself is a hash join with a selective filter and a
// limited sort.
const tpccOlapQuery = `SELECT
    i_id, s_w_id, s_quantity, i_price
FROM
    stock JOIN item ON s_i_id = i_id
WHERE
    s_quantity < 100 AND i_price > 90
ORDER BY
    i_price DESC, s_quantity ASC
LIMIT
    100;`
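
// tpccOLAPSpec describes one admission-control/tpcc-olap configuration: the
// cluster shape (Nodes, CPUs), the size of the imported TPC-C dataset
// (Warehouses), and the querybench concurrency used to overload it.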
type tpccOLAPSpec struct {
	Nodes       int
	CPUs        int
	Warehouses  int
	Concurrency int
}
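
// run imports a TPC-C dataset and then drives the OLAP query above through
// querybench at high concurrency, verifying afterwards that node liveness
// held up under the overload. setupTPCC, tpccOptions, and usingImport are
// shared TPC-C helpers defined elsewhere in this package.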
func (s tpccOLAPSpec) run(ctx context.Context, t test.Test, c cluster.Cluster) {
	crdbNodes, workloadNode := setupTPCC(
		ctx, t, c, tpccOptions{
			Warehouses: s.Warehouses, SetupType: usingImport,
		})
	// We make use of querybench below, only available through the `workload`
	// binary.
	c.Put(ctx, t.DeprecatedWorkload(), "./workload", workloadNode)

	const queryFileName = "queries.sql"
	// querybench expects the entire query to be on a single line.
	queryLine := `"` + strings.Replace(tpccOlapQuery, "\n", " ", -1) + `"`
	c.Run(ctx, workloadNode, "echo", queryLine, "> "+queryFileName)

	t.Status("waiting")
	m := c.NewMonitor(ctx, crdbNodes)
	rampDuration := 2 * time.Minute
	duration := 3 * time.Minute
	m.Go(func(ctx context.Context) error {
		t.WorkerStatus("running querybench")
		cmd := fmt.Sprintf(
			"./workload run querybench --db tpcc"+
				" --tolerate-errors=t"+
				" --concurrency=%d"+
				" --query-file %s"+
				" --histograms="+t.PerfArtifactsDir()+"/stats.json "+
				" --ramp=%s --duration=%s {pgurl:1-%d}",
			s.Concurrency, queryFileName, rampDuration, duration, c.Spec().NodeCount-1)
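		// For the single spec registered below (3 CRDB nodes plus one workload
		// node, concurrency 96), this expands to roughly:
		//   ./workload run querybench --db tpcc --tolerate-errors=t
		//     --concurrency=96 --query-file queries.sql
		//     --histograms=<artifacts-dir>/stats.json --ramp=2m0s
		//     --duration=3m0s {pgurl:1-3}
		// where <artifacts-dir> is t.PerfArtifactsDir() and roachprod
		// substitutes {pgurl:1-3} with the CRDB nodes' connection URLs.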
		c.Run(ctx, workloadNode, cmd)
		return nil
	})
	m.Wait()
	verifyNodeLiveness(ctx, c, t, duration)
}

// verifyNodeLiveness checks that node liveness did not fail more than
// maxFailures times across all of the nodes.
func verifyNodeLiveness(
	ctx context.Context, c cluster.Cluster, t test.Test, runDuration time.Duration,
) {
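	// Note: getMetrics and tsQuery are shared timeseries helpers defined
	// elsewhere in this package. The check below compares the first and last
	// datapoints of the cumulative liveness heartbeat-failure counter over
	// the run window.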
	const maxFailures = 10
	adminURLs, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(1))
	if err != nil {
		t.Fatal(err)
	}
	now := timeutil.Now()
	var response tspb.TimeSeriesQueryResponse
	// Retry because timeseries queries can fail if the underlying inter-node
	// connections are in a failed state, which can happen due to overload.
	// Now that the load has stopped, this should resolve itself soon. Even
	// with 60 retries we'll spend at most 30s attempting to fetch the metrics.
	if err := retry.WithMaxAttempts(ctx, retry.Options{
		MaxBackoff: 500 * time.Millisecond,
	}, 60, func() (err error) {
		response, err = getMetrics(adminURLs[0], now.Add(-runDuration), now, []tsQuery{
			{
				name:      "cr.node.liveness.heartbeatfailures",
				queryType: total,
			},
		})
		return err
	}); err != nil {
		t.Fatalf("failed to fetch liveness metrics: %v", err)
	}
	if len(response.Results[0].Datapoints) <= 1 {
		t.Fatalf("not enough datapoints in timeseries query response: %+v", response)
	}
	datapoints := response.Results[0].Datapoints
	finalCount := int(datapoints[len(datapoints)-1].Value)
	initialCount := int(datapoints[0].Value)
	if failures := finalCount - initialCount; failures > maxFailures {
		t.Fatalf("node liveness failed %d times, expected no more than %d",
			failures, maxFailures)
	} else {
		t.L().Printf("node liveness failed %d times, which is fewer than %d",
			failures, maxFailures)
	}
}
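
// registerTPCCOverload registers the admission-control/tpcc-olap roachtest
// variants defined by the specs below.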
func registerTPCCOverload(r registry.Registry) {
	specs := []tpccOLAPSpec{
		{
			CPUs:        8,
			Concurrency: 96,
			Nodes:       3,
			Warehouses:  50,
		},
	}
	for _, s := range specs {
		name := fmt.Sprintf("admission-control/tpcc-olap/nodes=%d/cpu=%d/w=%d/c=%d",
			s.Nodes, s.CPUs, s.Warehouses, s.Concurrency)
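		// With the single spec above this evaluates to
		// "admission-control/tpcc-olap/nodes=3/cpu=8/w=50/c=96".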
		r.Add(registry.TestSpec{
			Name:              name,
			Owner:             registry.OwnerAdmissionControl,
			Tags:              []string{`weekly`},
			Cluster:           r.MakeClusterSpec(s.Nodes+1, spec.CPU(s.CPUs)),
			Run:               s.run,
			EncryptionSupport: registry.EncryptionMetamorphic,
			Timeout:           20 * time.Minute,
		})
	}
}
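
// Usage sketch, assuming the standard roachtest invocation that filters tests
// by name regexp: something like
//
//	roachtest run 'admission-control/tpcc-olap/.*'
//
// runs the variant registered above.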

// This test begins a ramping TPCC workload that will overwhelm the CRDB nodes.
// There is no way to "pass" this test since the 6 nodes can't possibly handle
// 10K warehouses. If they could handle this load, then the test should be
// changed to increase that count. The purpose of the test is to make sure that
// the nodes don't fail under unsustainable overload. As of today (v22.2), the
// CRDB nodes will eventually OOM around 3-4 hours into the ramp period.
func registerTPCCSevereOverload(r registry.Registry) {
	r.Add(registry.TestSpec{
		Name:  "admission-control/tpcc-severe-overload",
		Owner: registry.OwnerAdmissionControl,
		// TODO(abaptist): This test will require a lot of admission control work
		// to pass. Just putting it here to make it easy to run at any time.
		Skip:    "#89142",
		Cluster: r.MakeClusterSpec(7, spec.CPU(8)),
		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
			roachNodes := c.Range(1, c.Spec().NodeCount-1)
			workloadNode := c.Spec().NodeCount
			c.Put(ctx, t.Cockroach(), "./cockroach", c.All())
			c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), roachNodes)

			t.Status("initializing (~1h)")
			c.Run(ctx, c.Node(workloadNode), "./cockroach workload fixtures import tpcc --checks=false --warehouses=10000 {pgurl:1}")

			// This run passes through 4 "phases":
			// 1) No admission control, low latencies (up to ~1500 warehouses).
			// 2) Admission control delays, growing latencies (up to ~3000 warehouses).
			// 3) High latencies (100s+), queues building (up to ~4500 warehouses).
			// 4) Unbounded memory and goroutine growth with eventual node crashes (up to ~6000 warehouses).
			t.Status("running workload (fails in ~3-4 hours)")
			c.Run(ctx, c.Node(workloadNode), "./cockroach workload run tpcc --ramp=6h --tolerate-errors --warehouses=10000 '{pgurl:1-6}'")
		},
	})
}