Skip to content

Commit

Permalink
Merge pull request #23576 from mrtracy/mtracy/lease_race_investigation
Browse files Browse the repository at this point in the history
storage: Partially de-flake TestStoreMetrics
  • Loading branch information
Matt Tracy authored Mar 8, 2018
2 parents f7b3725 + 8b890e6 commit ea5f8b2
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 38 deletions.
72 changes: 39 additions & 33 deletions pkg/storage/client_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/metric"
)

func checkGauge(t *testing.T, g *metric.Gauge, e int64) {
func checkGauge(t *testing.T, id string, g *metric.Gauge, e int64) {
t.Helper()
if a := g.Value(); a != e {
t.Error(errors.Errorf("%s for store: actual %d != expected %d", g.GetName(), a, e))
t.Error(errors.Errorf("%s for store %s: gauge %d != computed %d", g.GetName(), id, a, e))
}
}

func verifyStats(t *testing.T, mtc *multiTestContext, storeIdxSlice ...int) {
t.Helper()
var stores []*storage.Store
var wg sync.WaitGroup

Expand Down Expand Up @@ -81,17 +83,13 @@ func verifyStats(t *testing.T, mtc *multiTestContext, storeIdxSlice ...int) {
wg.Wait()

for _, s := range stores {
fatalf := func(msg string, args ...interface{}) {
prefix := s.Ident.String() + ": "
t.Fatalf(prefix+msg, args...)
}

idString := s.Ident.String()
m := s.Metrics()

// Sanity check: LiveBytes is not zero (ensures we don't have
// zeroed out structures.)
if liveBytes := m.LiveBytes.Value(); liveBytes == 0 {
fatalf("expected livebytes to be non-zero, was zero")
t.Errorf("store %s; got zero live bytes, expected non-zero", idString)
}

// Compute real total MVCC statistics from store.
Expand All @@ -101,23 +99,24 @@ func verifyStats(t *testing.T, mtc *multiTestContext, storeIdxSlice ...int) {
}

// Ensure that real MVCC stats match computed stats.
checkGauge(t, m.LiveBytes, realStats.LiveBytes)
checkGauge(t, m.KeyBytes, realStats.KeyBytes)
checkGauge(t, m.ValBytes, realStats.ValBytes)
checkGauge(t, m.IntentBytes, realStats.IntentBytes)
checkGauge(t, m.LiveCount, realStats.LiveCount)
checkGauge(t, m.KeyCount, realStats.KeyCount)
checkGauge(t, m.ValCount, realStats.ValCount)
checkGauge(t, m.IntentCount, realStats.IntentCount)
checkGauge(t, m.SysBytes, realStats.SysBytes)
checkGauge(t, m.SysCount, realStats.SysCount)
checkGauge(t, idString, m.LiveBytes, realStats.LiveBytes)
checkGauge(t, idString, m.KeyBytes, realStats.KeyBytes)
checkGauge(t, idString, m.ValBytes, realStats.ValBytes)
checkGauge(t, idString, m.IntentBytes, realStats.IntentBytes)
checkGauge(t, idString, m.LiveCount, realStats.LiveCount)
checkGauge(t, idString, m.KeyCount, realStats.KeyCount)
checkGauge(t, idString, m.ValCount, realStats.ValCount)
checkGauge(t, idString, m.IntentCount, realStats.IntentCount)
// TODO(mrtracy): Re-enable SysBytes check when #23574 is fixed.
// checkGauge(t, idString, m.SysBytes, realStats.SysBytes)
checkGauge(t, idString, m.SysCount, realStats.SysCount)
// "Ages" will be different depending on how much time has passed. Even with
// a manual clock, this can be an issue in tests. Therefore, we do not
// verify them in this test.
}

if t.Failed() {
fatalf("verifyStats failed, aborting test.")
}
if t.Failed() {
t.Fatalf("verifyStats failed, aborting test.")
}

// Restart all Stores.
Expand All @@ -136,7 +135,7 @@ func verifyRocksDBStats(t *testing.T, s *storage.Store) {
gauge *metric.Gauge
min int64
}{
{m.RdbBlockCacheHits, 10},
{m.RdbBlockCacheHits, 4},
{m.RdbBlockCacheMisses, 0},
{m.RdbBlockCacheUsage, 0},
{m.RdbBlockCachePinnedUsage, 0},
Expand All @@ -156,7 +155,6 @@ func verifyRocksDBStats(t *testing.T, s *storage.Store) {

func TestStoreMetrics(t *testing.T) {
defer leaktest.AfterTest(t)()
t.Skip("TODO(mrtracy): #9204")

mtc := &multiTestContext{}
defer mtc.Stop()
Expand Down Expand Up @@ -184,7 +182,7 @@ func TestStoreMetrics(t *testing.T) {
}

// Verify range count is as expected
checkGauge(t, mtc.stores[0].Metrics().ReplicaCount, 2)
checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

// Verify all stats on store0 after split.
verifyStats(t, mtc, 0)
Expand All @@ -203,8 +201,8 @@ func TestStoreMetrics(t *testing.T) {
}
mtc.waitForValues(roachpb.Key("z"), []int64{5, 5, 5})

// Verify all stats on store 0 and 1 after addition.
verifyStats(t, mtc, 0, 1)
// Verify all stats on stores after addition.
verifyStats(t, mtc, 0, 1, 2)

// Create a transaction statement that fails, but will add an entry to the
// sequence cache. Regression test for #4969.
Expand All @@ -217,22 +215,30 @@ func TestStoreMetrics(t *testing.T) {
}

// Verify stats after sequence cache addition.
verifyStats(t, mtc, 0)
checkGauge(t, mtc.stores[0].Metrics().ReplicaCount, 2)
verifyStats(t, mtc, 0, 1, 2)
checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 2)

// Unreplicate range from the first store.
mtc.unreplicateRange(replica.RangeID, 0)
testutils.SucceedsSoon(t, func() error {
// This statement can fail if store 0 is not the leaseholder.
if err := mtc.transferLeaseNonFatal(context.TODO(), replica.RangeID, 0, 1); err != nil {
t.Log(err)
}
// This statement will fail if store 0 IS the leaseholder. This can happen
// even after the previous statement.
return mtc.unreplicateRangeNonFatal(replica.RangeID, 0)
})

// Force GC Scan on store 0 in order to fully remove range.
mtc.stores[1].ForceReplicaGCScanAndProcess()
mtc.waitForValues(roachpb.Key("z"), []int64{0, 5, 5})

// Verify range count is as expected.
checkGauge(t, mtc.stores[0].Metrics().ReplicaCount, 1)
checkGauge(t, mtc.stores[1].Metrics().ReplicaCount, 1)
checkGauge(t, "store 0", mtc.stores[0].Metrics().ReplicaCount, 1)
checkGauge(t, "store 1", mtc.stores[1].Metrics().ReplicaCount, 1)

// Verify all stats on store0 and store1 after range is removed.
verifyStats(t, mtc, 0, 1)
// Verify all stats on all stores after range is removed.
verifyStats(t, mtc, 0, 1, 2)

verifyRocksDBStats(t, mtc.stores[0])
verifyRocksDBStats(t, mtc.stores[1])
Expand Down
26 changes: 21 additions & 5 deletions pkg/storage/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,7 @@ func (m *multiTestContext) changeReplicas(

// replicateRange replicates the given range onto the given stores.
func (m *multiTestContext) replicateRange(rangeID roachpb.RangeID, dests ...int) {
m.t.Helper()
if err := m.replicateRangeNonFatal(rangeID, dests...); err != nil {
m.t.Fatal(err)
}
Expand Down Expand Up @@ -1098,6 +1099,7 @@ func (m *multiTestContext) replicateRangeNonFatal(rangeID roachpb.RangeID, dests

// unreplicateRange removes a replica of the range from the dest store.
func (m *multiTestContext) unreplicateRange(rangeID roachpb.RangeID, dest int) {
m.t.Helper()
if err := m.unreplicateRangeNonFatal(rangeID, dest); err != nil {
m.t.Fatal(err)
}
Expand Down Expand Up @@ -1155,9 +1157,21 @@ func (m *multiTestContext) waitForValues(key roachpb.Key, expected []int64) {
func (m *multiTestContext) transferLease(
ctx context.Context, rangeID roachpb.RangeID, source int, dest int,
) {
if err := m.transferLeaseNonFatal(ctx, rangeID, source, dest); err != nil {
m.t.Fatal(err)
}
}

// transferLease transfers the lease for the given range from the source
// replica to the target replica. Assumes that the caller knows who the
// current leaseholder is.
// Returns an error rather than calling m.t.Fatal upon error.
func (m *multiTestContext) transferLeaseNonFatal(
ctx context.Context, rangeID roachpb.RangeID, source int, dest int,
) error {
live := m.stores[dest] != nil && !m.stores[dest].IsDraining()
if !live {
m.t.Fatalf("can't transfer lease to down or draining node at index %d", dest)
return errors.Errorf("can't transfer lease to down or draining node at index %d", dest)
}

// Heartbeat the liveness record of the destination node to make sure that the
Expand All @@ -1172,19 +1186,21 @@ func (m *multiTestContext) transferLease(
m.mu.RUnlock()
l, err := nl.Self()
if err != nil {
m.t.Fatal(err)
return err
}
if err := nl.Heartbeat(ctx, l); err != nil {
m.t.Fatal(err)
return err
}

sourceRepl, err := m.stores[source].GetReplica(rangeID)
if err != nil {
m.t.Fatal(err)
return err
}
if err := sourceRepl.AdminTransferLease(context.Background(), m.idents[dest].StoreID); err != nil {
m.t.Fatal(err)
return err
}

return nil
}

// advanceClock advances the mtc's manual clock such that all
Expand Down

0 comments on commit ea5f8b2

Please sign in to comment.