Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachtest: add cluster settings operations #123806

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ ALL_TESTS = [
"//pkg/cmd/roachprod-microbench/cluster:cluster_test",
"//pkg/cmd/roachprod-microbench:roachprod-microbench_test",
"//pkg/cmd/roachtest/clusterstats:clusterstats_test",
"//pkg/cmd/roachtest/operations:operations_test",
"//pkg/cmd/roachtest/option:option_test",
"//pkg/cmd/roachtest/registry:registry_test",
"//pkg/cmd/roachtest/roachtestflags:roachtestflags_test",
Expand Down Expand Up @@ -1164,6 +1165,7 @@ GO_TARGETS = [
"//pkg/cmd/roachtest/grafana:grafana",
"//pkg/cmd/roachtest/operation:operation",
"//pkg/cmd/roachtest/operations:operations",
"//pkg/cmd/roachtest/operations:operations_test",
"//pkg/cmd/roachtest/option:option",
"//pkg/cmd/roachtest/option:option_test",
"//pkg/cmd/roachtest/registry:registry",
Expand Down
11 changes: 10 additions & 1 deletion pkg/cmd/roachtest/operations/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "operations",
srcs = [
"add_column.go",
"add_index.go",
"cluster_settings.go",
"disk_stall.go",
"network_partition.go",
"node_kill.go",
Expand All @@ -22,5 +23,13 @@ go_library(
"//pkg/cmd/roachtest/roachtestutil",
"//pkg/roachprod",
"//pkg/util/randutil",
"//pkg/util/timeutil",
],
)

go_test(
name = "operations_test",
srcs = ["cluster_settings_test.go"],
embed = [":operations"],
deps = ["@com_github_stretchr_testify//require"],
)
120 changes: 120 additions & 0 deletions pkg/cmd/roachtest/operations/cluster_settings.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package operations

import (
"context"
"fmt"
"math/rand"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/operation"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestflags"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

type clusterSettingOp struct {
Name string
Generator func() string
Owner registry.Owner
}

func timeBasedValue(
timeSupplier func() time.Time, frequency time.Duration, valueForSegment func(int64) string,
) func() string {
return func() string {
segment := timeSupplier().Unix() / int64(frequency.Seconds())
return valueForSegment(segment)
}
}

// timeBasedValues returns a function that returns a value from the given list
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

// of values based on the current time and the given frequency.
func timeBasedValues(
timeSupplier func() time.Time, values []string, frequency time.Duration,
) func() string {
return timeBasedValue(timeSupplier, frequency, func(segment int64) string {
idx := int(segment) % len(values)
return values[idx]
})
}

// timeBasedRandomValue returns a function that returns a random value based on
// the current time and the given frequency.
func timeBasedRandomValue(
timeSupplier func() time.Time, frequency time.Duration, valueFromRNG func(*rand.Rand) string,
) func() string {
return func() string {
segment := timeSupplier().Unix() / int64(frequency.Seconds())
return valueFromRNG(randutil.NewTestRandWithSeed(segment))
}
}

func setClusterSetting(
ctx context.Context, o operation.Operation, c cluster.Cluster, op clusterSettingOp,
) registry.OperationCleanup {
value := op.Generator()
conn := c.Conn(ctx, o.L(), 1, option.VirtualClusterName(roachtestflags.VirtualCluster))
defer conn.Close()

o.Status(fmt.Sprintf("setting cluster setting %s to %s", op.Name, value))
_, err := conn.ExecContext(ctx, fmt.Sprintf("SET CLUSTER SETTING %s = '%s'", op.Name, value))
if err != nil {
o.Fatal(err)
}
return nil
}

func registerClusterSettings(r registry.Registry) {
timeSupplier := func() time.Time {
return timeutil.Now()
}
ops := []clusterSettingOp{
// Converts all leases to expiration. Tradeoff between lower throughput and higher availability.
// Weekly cycle.
{
Name: "kv.expiration_leases_only.enabled",
Generator: timeBasedValues(timeSupplier, []string{"true", "false"}, 24*7*time.Minute),
Owner: registry.OwnerKV,
},
// When running multi-store with `--wal-failover=among-stores`, this configures
// the threshold to trigger a fail-over to a secondary store’s WAL.
// 20-minute cycle.
{
Name: "storage.wal_failover.unhealthy_op_threshold",
Generator: timeBasedRandomValue(timeSupplier, 20*time.Minute, func(rng *rand.Rand) string {
return fmt.Sprintf("%d", rng.Intn(246)+5)
}),
Owner: registry.OwnerStorage,
},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add one for kv.snapshot_receiver.excise.enabled as well, but I'm good with this as a follow-up change.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, merging this one for now. Will create a follow-up PR with some additional cluster settings, I have some others in mind as well.

}
sanitizeOpName := func(name string) string {
return strings.ReplaceAll(name, ".", "_")
}
for _, op := range ops {
r.AddOperation(registry.OperationSpec{
Name: "cluster-settings/scheduled/" + sanitizeOpName(op.Name),
Owner: op.Owner,
Timeout: 5 * time.Minute,
CompatibleClouds: registry.AllClouds,
Dependencies: []registry.OperationDependency{registry.OperationRequiresNodes},
Run: func(ctx context.Context, o operation.Operation, c cluster.Cluster) registry.OperationCleanup {
return setClusterSetting(ctx, o, c, op)
},
})
}
}
80 changes: 80 additions & 0 deletions pkg/cmd/roachtest/operations/cluster_settings_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
//

package operations

import (
"fmt"
"math/rand"
"testing"
"time"

"github.com/stretchr/testify/require"
)

func TestTimeBasedValueGenerator(t *testing.T) {
// Define a time supplier that steps by 10 minutes
curTime := time.Date(2022, 1, 1, 0, 20, 0, 0, time.UTC)
timeSupplier := func() time.Time {
defer func() {
curTime = curTime.Add(10 * time.Minute)
}()
return curTime
}

// Define the values and frequency for the test
values := []string{"value1", "value2", "value3"}
frequency := 15 * time.Minute

// Create the value generator
valueGenerator := timeBasedValues(timeSupplier, values, frequency)
// Call the value generator and check the result, one value should be generated
// twice and the other once, since the frequency is 15 minutes and the time
// supplier steps by 10 minutes.
counts := map[string]int{}
for i := 0; i < 3; i++ {
value := valueGenerator()
counts[value]++
}
expectedCounts := map[string]int{
"value2": 1,
"value3": 2,
}
require.Equal(t, expectedCounts, counts)
}

func TestTimeBasedRandomValueGenerator(t *testing.T) {
// Define a time supplier that steps by 10 minutes
curTime := time.Date(2022, 1, 1, 0, 20, 0, 0, time.UTC)
timeSupplier := func() time.Time {
defer func() {
curTime = curTime.Add(10 * time.Minute)
}()
return curTime
}

// Create the value generator
valueGenerator := timeBasedRandomValue(timeSupplier, 30*time.Minute, func(rng *rand.Rand) string {
return fmt.Sprintf("%d", rng.Intn(246)+5)
})
// Call the value generator and check the result, we expect the same value to
// be generated a few times since the call frequency is higher than the random
// value generation frequency.
counts := map[string]int{}
for i := 0; i < 10; i++ {
value := valueGenerator()
counts[value]++
}
expectedCounts := map[string]int{
"108": 3, "178": 3, "43": 1, "58": 3,
}
require.Equal(t, expectedCounts, counts)
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/operations/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ func RegisterOperations(r registry.Registry) {
registerNetworkPartition(r)
registerDiskStall(r)
registerNodeKill(r)
registerClusterSettings(r)
}
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/roachtestutil/operations/dependency.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func CheckDependencies(
for _, dep := range spec.Dependencies {
switch dep {
case registry.OperationRequiresNodes:
if len(c.Nodes()) == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's counter-intuitive that c.Nodes() denotes the empty set (of nodes). We should probably update the API; my quick audit of all the callers didn't yield any other than the above.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, took me a few to realise why OperationRequiresNodes wasn't working.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching this!

if len(c.All()) == 0 {
return false, nil
}
case registry.OperationRequiresPopulatedDatabase:
Expand Down
Loading