Skip to content

Commit

Permalink
kvserver: fix flake in TestStoreMetrics
Browse files Browse the repository at this point in the history
Fixes cockroachdb#60146

When TestStoreMetrics was converted to use TestCluster in cockroachdb#59670.
The verifyStats method was changed to not stop the stores anymore
that was a mistake that caused this flake. This change adds back the
stopping and fixes a race with lease transfers.

Release note: None
  • Loading branch information
lunevalex committed May 17, 2021
1 parent 09bae48 commit 72c9595
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 19 deletions.
70 changes: 51 additions & 19 deletions pkg/kv/kvserver/client_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ package kvserver_test
import (
"context"
"fmt"
"strconv"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -54,6 +56,8 @@ func checkGauge(t *testing.T, id string, g gaugeValuer, e int64) {
func verifyStats(t *testing.T, tc *testcluster.TestCluster, storeIdxSlice ...int) {
t.Helper()
var stores []*kvserver.Store
var wg sync.WaitGroup

for _, storeIdx := range storeIdxSlice {
stores = append(stores, tc.GetFirstStoreFromServer(t, storeIdx))
}
Expand All @@ -71,6 +75,20 @@ func verifyStats(t *testing.T, tc *testcluster.TestCluster, storeIdxSlice ...int
})
}

wg.Add(len(storeIdxSlice))
// We actually stop *all* of the Servers. Stopping only a few is riddled
// with deadlocks since operations can span nodes, but stoppers don't
// know about this - taking all of them down at the same time is the
// only sane way of guaranteeing that nothing interesting happens, at
// least when bringing down the nodes jeopardizes majorities.
for _, storeIdx := range storeIdxSlice {
go func(i int) {
defer wg.Done()
tc.StopServer(i)
}(storeIdx)
}
wg.Wait()

for _, s := range stores {
idString := s.Ident.String()
m := s.Metrics()
Expand Down Expand Up @@ -107,6 +125,11 @@ func verifyStats(t *testing.T, tc *testcluster.TestCluster, storeIdxSlice ...int
if t.Failed() {
t.Fatalf("verifyStats failed, aborting test.")
}

// Restart all Stores.
for _, storeIdx := range storeIdxSlice {
require.NoError(t, tc.RestartServer(storeIdx))
}
}

func verifyRocksDBStats(t *testing.T, s *kvserver.Store) {
Expand Down Expand Up @@ -228,26 +251,37 @@ func TestStoreMetrics(t *testing.T) {
defer log.Scope(t).Close(t)

ctx := context.Background()
tc := testcluster.StartTestCluster(t, 3,
base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
CacheSize: 1 << 20, /* 1 MiB */
StoreSpecs: []base.StoreSpec{
{
InMemory: true,
// Specify a size to trigger the BlockCache in Pebble.
Size: base.SizeSpec{
InBytes: 512 << 20, /* 512 MiB */
},
stickyEngineRegistry := server.NewStickyInMemEnginesRegistry()
defer stickyEngineRegistry.CloseAllStickyInMemEngines()
const numServers int = 3
stickyServerArgs := make(map[int]base.TestServerArgs)
for i := 0; i < numServers; i++ {
stickyServerArgs[i] = base.TestServerArgs{
CacheSize: 1 << 20, /* 1 MiB */
StoreSpecs: []base.StoreSpec{
{
InMemory: true,
StickyInMemoryEngineID: strconv.FormatInt(int64(i), 10),
// Specify a size to trigger the BlockCache in Pebble.
Size: base.SizeSpec{
InBytes: 512 << 20, /* 512 MiB */
},
},
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
DisableRaftLogQueue: true,
},
},
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
StickyEngineRegistry: stickyEngineRegistry,
},
Store: &kvserver.StoreTestingKnobs{
DisableRaftLogQueue: true,
},
},
}
}
tc := testcluster.StartTestCluster(t, numServers,
base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgsPerNode: stickyServerArgs,
})
defer tc.Stopper().Stop(ctx)

Expand Down Expand Up @@ -303,9 +337,7 @@ func TestStoreMetrics(t *testing.T) {
// Verify stats after addition.
verifyStats(t, tc, 1, 2)
checkGauge(t, "store 0", tc.GetFirstStoreFromServer(t, 0).Metrics().ReplicaCount, initialCount+1)

tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(1))
tc.RemoveVotersOrFatal(t, key, tc.Target(0))
tc.RemoveLeaseHolderOrFatal(t, desc, tc.Target(0), tc.Target(1))
testutils.SucceedsSoon(t, func() error {
_, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
if err == nil {
Expand Down
21 changes: 21 additions & 0 deletions pkg/testutils/testcluster/testcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,27 @@ func (tc *TestCluster) TransferRangeLeaseOrFatal(
}
}

// RemoveLeaseHolderOrFatal is a convenience version of TransferRangeLease and RemoveVoter
func (tc *TestCluster) RemoveLeaseHolderOrFatal(
t testing.TB,
rangeDesc roachpb.RangeDescriptor,
src roachpb.ReplicationTarget,
dest roachpb.ReplicationTarget,
) {
testutils.SucceedsSoon(t, func() error {
if err := tc.TransferRangeLease(rangeDesc, dest); err != nil {
return err
}
if _, err := tc.RemoveVoters(rangeDesc.StartKey.AsRawKey(), src); err != nil {
if strings.Contains(err.Error(), "to remove self (leaseholder)") {
return err
}
t.Fatal(err)
}
return nil
})
}

// MoveRangeLeaseNonCooperatively is part of the TestClusterInterface.
func (tc *TestCluster) MoveRangeLeaseNonCooperatively(
rangeDesc roachpb.RangeDescriptor, dest roachpb.ReplicationTarget, manual *hlc.HybridManualClock,
Expand Down

0 comments on commit 72c9595

Please sign in to comment.