Skip to content

Commit

Permalink
Merge pull request cockroachdb#129327 from cockroachdb/blathers/backport-release-24.2-129117
Browse files Browse the repository at this point in the history
  • Loading branch information
renatolabs authored Aug 20, 2024
2 parents 1f02d5d + 5612599 commit aa5cfc5
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 25 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func verifyNodeLiveness(
if err := retry.WithMaxAttempts(ctx, retry.Options{
MaxBackoff: 500 * time.Millisecond,
}, 60, func() (err error) {
response, err = getMetrics(ctx, c, t, adminURLs[0], now.Add(-runDuration), now, []tsQuery{
response, err = getMetrics(ctx, c, t, adminURLs[0], "", now.Add(-runDuration), now, []tsQuery{
{
name: "cr.node.liveness.heartbeatfailures",
queryType: total,
Expand Down
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/tests/disk_stall.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ func runDiskStalledWALFailover(
t.Fatal("process exited unexectedly")
}

data := mustGetMetrics(ctx, c, t, adminURL,
data := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName,
workloadStartAt.Add(5*time.Minute),
timeutil.Now().Add(-time.Minute),
[]tsQuery{
Expand Down Expand Up @@ -308,7 +308,7 @@ func runDiskStalledDetection(
}

stallAt := timeutil.Now()
response := mustGetMetrics(ctx, c, t, adminURL, workloadStartAt, stallAt, []tsQuery{
response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, workloadStartAt, stallAt, []tsQuery{
{name: "cr.node.sql.query.count", queryType: total},
})
cum := response.Results[0].Datapoints
Expand Down Expand Up @@ -360,7 +360,7 @@ func runDiskStalledDetection(

{
now := timeutil.Now()
response := mustGetMetrics(ctx, c, t, adminURL, workloadStartAt, now, []tsQuery{
response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, workloadStartAt, now, []tsQuery{
{name: "cr.node.sql.query.count", queryType: total},
})
cum := response.Results[0].Datapoints
Expand Down
35 changes: 19 additions & 16 deletions pkg/cmd/roachtest/tests/rebalance_load.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,15 @@ func registerRebalanceLoad(r registry.Registry) {
mixedversion.ClusterSettingOption(
install.ClusterSettingsOption(settings.ClusterSettings),
),
// Multi-tenant deployments are currently unsupported. See #127378.
mixedversion.EnabledDeploymentModes(mixedversion.SystemOnlyDeployment),
)
mvt.OnStartup("maybe enable split/scatter on tenant",
func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
return enableTenantSplitScatter(l, r, h)
})
mvt.InMixedVersion("rebalance load run",
func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
return rebalanceByLoad(
ctx, t, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes)
ctx, t, l, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes)
})
mvt.Run()
} else {
Expand All @@ -122,7 +124,7 @@ func registerRebalanceLoad(r registry.Registry) {
settings.ClusterSettings["server.cpu_profile.cpu_usage_combined_threshold"] = "90"
c.Start(ctx, t.L(), startOpts, settings, roachNodes)
require.NoError(t, rebalanceByLoad(
ctx, t, c, rebalanceMode, maxDuration,
ctx, t, t.L(), c, rebalanceMode, maxDuration,
concurrency, appNode, numStores, numNodes,
))
}
Expand Down Expand Up @@ -193,7 +195,7 @@ func registerRebalanceLoad(r registry.Registry) {
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
if c.IsLocal() {
concurrency = 32
fmt.Printf("lowering concurrency to %d in local testing\n", concurrency)
t.L().Printf("lowering concurrency to %d in local testing", concurrency)
}
rebalanceLoadRun(
ctx, t, c, "leases and replicas", 10*time.Minute, concurrency, true, /* mixedVersion */
Expand Down Expand Up @@ -231,6 +233,7 @@ func registerRebalanceLoad(r registry.Registry) {
func rebalanceByLoad(
ctx context.Context,
t test.Test,
l *logger.Logger,
c cluster.Cluster,
rebalanceMode string,
maxDuration time.Duration,
Expand All @@ -246,10 +249,10 @@ func rebalanceByLoad(
splits := (numStores * storeToRangeFactor) - 1
c.Run(ctx, option.WithNodes(appNode), fmt.Sprintf("./cockroach workload init kv --drop --splits=%d {pgurl:1}", splits))

db := c.Conn(ctx, t.L(), 1)
db := c.Conn(ctx, l, 1)
defer db.Close()

require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), db))
require.NoError(t, WaitFor3XReplication(ctx, t, l, db))

var m *errgroup.Group
m, ctx = errgroup.WithContext(ctx)
Expand All @@ -260,7 +263,7 @@ func rebalanceByLoad(
ctx, cancel := context.WithCancel(ctx)

m.Go(func() error {
t.L().Printf("starting load generator\n")
l.Printf("starting load generator")
err := c.RunE(ctx, option.WithNodes(appNode), fmt.Sprintf(
"./cockroach workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
"--duration=%v {pgurl:1-%d}",
Expand All @@ -275,9 +278,9 @@ func rebalanceByLoad(
})

m.Go(func() error {
t.Status("checking for CPU balance")
l.Printf("checking for CPU balance")

storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores)
storeCPUFn, err := makeStoreCPUFn(ctx, t, l, c, numNodes, numStores)
if err != nil {
return err
}
Expand All @@ -297,18 +300,18 @@ func rebalanceByLoad(
now := timeutil.Now()
clusterStoresCPU, err := storeCPUFn(ctx)
if err != nil {
t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error())
l.Printf("unable to get the cluster stores CPU: %v", err)
continue
}
var curIsBalanced bool
curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
t.L().Printf("cpu %s", reason)
l.Printf("cpu %s", reason)
if !prevIsBalanced && curIsBalanced {
balancedStartTime = now
}
prevIsBalanced = curIsBalanced
if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration {
t.Status("successfully achieved CPU balance; waiting for kv to finish running")
l.Printf("successfully achieved CPU balance; waiting for kv to finish running")
cancel()
return nil
}
Expand All @@ -322,9 +325,9 @@ func rebalanceByLoad(
// the cluster stores. When there are multiple stores per node, stores on the
// same node will report identical CPU.
func makeStoreCPUFn(
octx context.Context, c cluster.Cluster, t test.Test, numNodes, numStores int,
ctx context.Context, t test.Test, l *logger.Logger, c cluster.Cluster, numNodes, numStores int,
) (func(ctx context.Context) ([]float64, error), error) {
adminURLs, err := c.ExternalAdminUIAddr(octx, t.L(), c.Node(1))
adminURLs, err := c.ExternalAdminUIAddr(ctx, l, c.Node(1))
if err != nil {
return nil, err
}
Expand All @@ -342,7 +345,7 @@ func makeStoreCPUFn(
return func(ctx context.Context) ([]float64, error) {
now := timeutil.Now()
resp, err := getMetricsWithSamplePeriod(
ctx, c, t, url, startTime, now, statSamplePeriod, tsQueries)
ctx, c, t, url, install.SystemInterfaceName, startTime, now, statSamplePeriod, tsQueries)
if err != nil {
return nil, err
}
Expand Down
17 changes: 12 additions & 5 deletions pkg/cmd/roachtest/tests/ts_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
)

Expand Down Expand Up @@ -51,10 +52,11 @@ func mustGetMetrics(
c cluster.Cluster,
t test.Test,
adminURL string,
virtualCluster string,
start, end time.Time,
tsQueries []tsQuery,
) tspb.TimeSeriesQueryResponse {
response, err := getMetrics(ctx, c, t, adminURL, start, end, tsQueries)
response, err := getMetrics(ctx, c, t, adminURL, virtualCluster, start, end, tsQueries)
if err != nil {
t.Fatal(err)
}
Expand All @@ -66,17 +68,19 @@ func getMetrics(
c cluster.Cluster,
t test.Test,
adminURL string,
virtualCluster string,
start, end time.Time,
tsQueries []tsQuery,
) (tspb.TimeSeriesQueryResponse, error) {
return getMetricsWithSamplePeriod(ctx, c, t, adminURL, start, end, defaultSamplePeriod, tsQueries)
return getMetricsWithSamplePeriod(ctx, c, t, adminURL, virtualCluster, start, end, defaultSamplePeriod, tsQueries)
}

func getMetricsWithSamplePeriod(
ctx context.Context,
c cluster.Cluster,
t test.Test,
adminURL string,
virtualCluster string,
start, end time.Time,
samplePeriod time.Duration,
tsQueries []tsQuery,
Expand Down Expand Up @@ -114,7 +118,10 @@ func getMetricsWithSamplePeriod(
Queries: queries,
}
var response tspb.TimeSeriesQueryResponse
client := roachtestutil.DefaultHTTPClient(c, t.L(), roachtestutil.HTTPTimeout(500*time.Millisecond))
client := roachtestutil.DefaultHTTPClient(
c, t.L(), roachtestutil.HTTPTimeout(500*time.Millisecond),
roachtestutil.VirtualCluster(virtualCluster),
)
err := client.PostProtobuf(ctx, url, &request, &response)
return response, err

Expand All @@ -134,7 +141,7 @@ func verifyTxnPerSecond(
t.Fatal(err)
}
adminURL := adminUIAddrs[0]
response := mustGetMetrics(ctx, c, t, adminURL, start, end, []tsQuery{
response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, start, end, []tsQuery{
{name: "cr.node.txn.commits", queryType: rate},
{name: "cr.node.txn.commits", queryType: total},
})
Expand Down Expand Up @@ -185,7 +192,7 @@ func verifyLookupsPerSec(
t.Fatal(err)
}
adminURL := adminUIAddrs[0]
response := mustGetMetrics(ctx, c, t, adminURL, start, end, []tsQuery{
response := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName, start, end, []tsQuery{
{name: "cr.node.distsender.rangelookups", queryType: rate},
})

Expand Down

0 comments on commit aa5cfc5

Please sign in to comment.