From 4f9e30da3f155efbc53a59efd77545249eb7b8f4 Mon Sep 17 00:00:00 2001
From: sumeerbhola
Date: Tue, 24 Oct 2023 20:24:15 -0400
Subject: [PATCH] kvserver: add BenchmarkNodeLivenessScanStorage to measure
 liveness scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Node liveness scans, such as the one done in
MaybeGossipNodeLivenessRaftMuLocked while holding raftMu, are
performance sensitive, and slowness there has caused production issues
(https://github.com/cockroachlabs/support/issues/2665,
https://github.com/cockroachlabs/support/issues/2107).

This benchmark measures the scan performance both when DELs (due to GC)
have not been compacted away, and when they have. It also sets up a
varying number of live versions, since decommissioned nodes will have
only a single live version (while active nodes have hundreds).

Results on an M1 MacBook on master with dead-keys=false and
compacted=true:

NodeLivenessScanStorage/num-live=2/compacted=true-10       26.80µ ± 9%
NodeLivenessScanStorage/num-live=5/compacted=true-10       30.34µ ± 3%
NodeLivenessScanStorage/num-live=10/compacted=true-10      38.88µ ± 8%
NodeLivenessScanStorage/num-live=1000/compacted=true-10    861.5µ ± 3%

When compacted=false the scan takes ~10ms, which is > 100x slower, but
probably acceptable for this workload.

NodeLivenessScanStorage/num-live=2/compacted=false-10      9.430m ± 5%
NodeLivenessScanStorage/num-live=5/compacted=false-10      9.534m ± 4%
NodeLivenessScanStorage/num-live=10/compacted=false-10     9.456m ± 2%
NodeLivenessScanStorage/num-live=1000/compacted=false-10   10.34m ± 7%

dead-keys=true (and compacted=false) defeats the NextPrefix
optimization, since the next prefix can have all its keys deleted and
the iterator has to step through all of them (it can't be sure that all
the keys for that next prefix are deleted). This case should not occur
in the liveness range, as we don't remove decommissioned entries, but
it is included for better understanding.

NodeLivenessScanStorage/num-live=2/dead-keys=true/compacted=false-10   58.33m

Compared to v22.2, the results are sometimes > 10x faster, in the cases
where the pebbleMVCCScanner seek optimization in v22.2 was defeated.
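The before/after comparison below looks like benchstat output over six
runs per configuration (n=6). A rough sketch of how such a comparison
can be regenerated follows; the file names old.txt/new.txt and the
plain go test invocation are illustrative assumptions, and the exact
command depends on the local build setup:

  # illustrative; run once on the base branch (old.txt) and once on this
  # branch (new.txt), then compare
  go test ./pkg/kv/kvserver -run '^$' -bench BenchmarkNodeLivenessScanStorage \
    -benchmem -count 6 | tee new.txt
  benchstat old.txt new.txt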
                                                          │    sec/op     │    sec/op     vs base                │
NodeLivenessScanStorage/num-live=2/compacted=false-10       117.280m ± 2%    9.430m ± 5%  -91.96% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=5/compacted=false-10       117.298m ± 0%    9.534m ± 4%  -91.87% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=10/compacted=false-10       12.009m ± 0%    9.456m ± 2%  -21.26% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=1000/compacted=false-10      13.04m ± 0%    10.34m ± 7%  -20.66% (p=0.002 n=6)

                                                          │ block-bytes/op │ block-bytes/op  vs base                │
NodeLivenessScanStorage/num-live=2/compacted=false-10          14.565Mi ± 0%    8.356Mi ± 0%  -42.63% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=5/compacted=false-10          14.570Mi ± 0%    8.361Mi ± 0%  -42.61% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=10/compacted=false-10         11.094Mi ± 0%    8.368Mi ± 0%  -24.57% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=1000/compacted=false-10       12.235Mi ± 0%    8.990Mi ± 0%  -26.53% (p=0.002 n=6)

                                                          │     B/op     │     B/op      vs base                │
NodeLivenessScanStorage/num-live=2/compacted=false-10       42.83Ki ± 4%   41.87Ki ± 0%   -2.22% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=5/compacted=false-10       43.28Ki ± 3%   41.84Ki ± 0%   -3.32% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=10/compacted=false-10      37.59Ki ± 0%   41.92Ki ± 0%  +11.51% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=1000/compacted=false-10    37.67Ki ± 1%   42.66Ki ± 0%  +13.23% (p=0.002 n=6)

                                                          │  allocs/op  │  allocs/op   vs base                │
NodeLivenessScanStorage/num-live=2/compacted=false-10        105.00 ± 8%   85.00 ± 0%  -19.05% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=5/compacted=false-10        107.00 ± 5%   85.00 ± 0%  -20.56% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=10/compacted=false-10        74.00 ± 1%   85.00 ± 0%  +14.86% (p=0.002 n=6)
NodeLivenessScanStorage/num-live=1000/compacted=false-10      79.00 ± 1%   92.00 ± 1%  +16.46% (p=0.002 n=6)

Relates to https://github.com/cockroachlabs/support/issues/2665

Epic: none

Release note: None
---
 pkg/kv/kvserver/node_liveness_test.go | 147 ++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/pkg/kv/kvserver/node_liveness_test.go b/pkg/kv/kvserver/node_liveness_test.go
index 38135491119a..cd358b707a15 100644
--- a/pkg/kv/kvserver/node_liveness_test.go
+++ b/pkg/kv/kvserver/node_liveness_test.go
@@ -13,6 +13,7 @@ package kvserver_test
 import (
   "bytes"
   "context"
+  "fmt"
   "reflect"
   "sort"
   "strconv"
@@ -33,9 +34,11 @@ import (
   "github.com/cockroachdb/cockroach/pkg/roachpb"
   "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
   "github.com/cockroachdb/cockroach/pkg/server"
+  "github.com/cockroachdb/cockroach/pkg/storage"
   "github.com/cockroachdb/cockroach/pkg/testutils"
   "github.com/cockroachdb/cockroach/pkg/testutils/listenerutil"
   "github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+  "github.com/cockroachdb/cockroach/pkg/testutils/skip"
   "github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
   "github.com/cockroachdb/cockroach/pkg/util/hlc"
   "github.com/cockroachdb/cockroach/pkg/util/leaktest"
@@ -1301,3 +1304,147 @@ func TestNodeLivenessDecommissionAbsent(t *testing.T) {
   // Recommission from third node.
   setMembershipStatus(nl2, livenesspb.MembershipStatus_ACTIVE, true)
 }
+
+func BenchmarkNodeLivenessScanStorage(b *testing.B) {
+  skip.UnderShort(b)
+  defer log.Scope(b).Close(b)
+
+  ctx := context.Background()
+  const numNodes = 100
+  setupEng := func(b *testing.B, numLiveVersions int, haveFullyDeadKeys bool) storage.Engine {
+    eng := storage.NewDefaultInMemForTesting(storage.DisableAutomaticCompactions)
+    // 20 per minute, so 1000 represents 50 min of liveness writes in a level.
+    // This is unusual, but we can have such accumulation if flushes and
+    // compactions are rare.
+    const numVersionsPerLevel = 1000
+    // All versions in level l will be deleted in level l+1. The versions
+    // written at the highest level are not deleted and the number of these
+    // per node is controlled by numLiveVersions. Additionally, if
+    // haveFullyDeadKeys is true, the highest level only has live versions for
+    // alternating nodes.
+    //
+    // NB: haveFullyDeadKeys=true is not representative of NodeLiveness, since
+    // we don't remove decommissioned entries. It is included here just to
+    // understand the effect on the NextPrefix optimization.
+    const numLevels = 7
+    tsFunc := func(l int, v int) int64 {
+      return int64(l*numVersionsPerLevel + v + 10)
+    }
+    for l := 0; l < numLevels; l++ {
+      for v := 0; v < numVersionsPerLevel; v++ {
+        ts := tsFunc(l, v)
+        for n := roachpb.NodeID(0); n < numNodes; n++ {
+          lKey := keys.NodeLivenessKey(n)
+          // Always write a version if not at the highest level. If at the
+          // highest level, only write if v < numLiveVersions. Additionally,
+          // either haveFullyDeadKeys must be false or this node is one that
+          // has live versions.
+          if l < numLevels-1 || (v < numLiveVersions && (!haveFullyDeadKeys || n%2 == 0)) {
+            liveness := livenesspb.Liveness{
+              NodeID:     n,
+              Epoch:      100,
+              Expiration: hlc.LegacyTimestamp{WallTime: ts},
+              Draining:   false,
+              Membership: livenesspb.MembershipStatus_ACTIVE,
+            }
+
+            require.NoError(b, storage.MVCCPutProto(
+              ctx, eng, lKey, hlc.Timestamp{WallTime: ts}, &liveness,
+              storage.MVCCWriteOptions{}))
+          }
+          // Else most recent level and the other conditions for writing a
+          // version are not satisfied.
+
+          if l != 0 {
+            // Clear the key from the next older level.
+            require.NoError(b, eng.ClearMVCC(storage.MVCCKey{
+              Key:       lKey,
+              Timestamp: hlc.Timestamp{WallTime: tsFunc(l-1, v)},
+            }, storage.ClearOptions{}))
+          }
+        }
+        if l == 0 && v < 10 {
+          // Flush to grow the memtable size.
+          require.NoError(b, eng.Flush())
+        }
+      }
+      if l == 0 {
+        // Since we did many flushes, compact everything down.
+        require.NoError(b, eng.Compact())
+      } else {
+        // Flush the next level. This will become an L0 sub-level.
+        require.NoError(b, eng.Flush())
+      }
+    }
+    return eng
+  }
+  scanLiveness := func(b *testing.B, eng storage.Engine, expectedCount int) (blockBytes uint64) {
+    ss := &kvpb.ScanStats{}
+    opts := storage.MVCCScanOptions{
+      ScanStats: ss,
+    }
+    scanRes, err := storage.MVCCScan(
+      ctx, eng.NewReadOnly(storage.StandardDurability), keys.NodeLivenessPrefix,
+      keys.NodeLivenessKeyMax, hlc.MaxTimestamp, opts)
+    if err != nil {
+      b.Fatal(err.Error())
+    }
+    if expectedCount != len(scanRes.KVs) {
+      b.Fatalf("expected %d != actual %d", expectedCount, len(scanRes.KVs))
+    }
+    return ss.BlockBytes
+  }
+
+  // We expect active nodes to have 100s of live versions since liveness is
+  // written every 3s, and GC is configured to happen after 10min. But GC can
+  // be delayed, and decommissioned nodes will only have 1 version, so we
+  // explore those extremes.
+  //
+  // Results on an M1 MacBook with dead-keys=false and compacted=true:
+  // NodeLivenessScanStorage/num-live=2/compacted=true-10       26.80µ ± 9%
+  // NodeLivenessScanStorage/num-live=5/compacted=true-10       30.34µ ± 3%
+  // NodeLivenessScanStorage/num-live=10/compacted=true-10      38.88µ ± 8%
+  // NodeLivenessScanStorage/num-live=1000/compacted=true-10    861.5µ ± 3%
+  //
+  // When compacted=false the scan takes ~10ms, which is > 100x slower, but
+  // probably acceptable for this workload.
+  // NodeLivenessScanStorage/num-live=2/compacted=false-10      9.430m ± 5%
+  // NodeLivenessScanStorage/num-live=5/compacted=false-10      9.534m ± 4%
+  // NodeLivenessScanStorage/num-live=10/compacted=false-10     9.456m ± 2%
+  // NodeLivenessScanStorage/num-live=1000/compacted=false-10   10.34m ± 7%
+  //
+  // dead-keys=true (and compacted=false) defeats the NextPrefix optimization,
+  // since the next prefix can have all its keys deleted and the iterator has
+  // to step through all of them (it can't be sure that all the keys for that
+  // next prefix are deleted). This case should not occur in the liveness
+  // range, as discussed earlier.
+  //
+  // NodeLivenessScanStorage/num-live=2/dead-keys=true/compacted=false-10   58.33m
+  for _, numLiveVersions := range []int{2, 5, 10, 1000} {
+    b.Run(fmt.Sprintf("num-live=%d", numLiveVersions), func(b *testing.B) {
+      for _, haveDeadKeys := range []bool{false, true} {
+        b.Run(fmt.Sprintf("dead-keys=%t", haveDeadKeys), func(b *testing.B) {
+          for _, compacted := range []bool{false, true} {
+            b.Run(fmt.Sprintf("compacted=%t", compacted), func(b *testing.B) {
+              eng := setupEng(b, numLiveVersions, haveDeadKeys)
+              defer eng.Close()
+              if compacted {
+                require.NoError(b, eng.Compact())
+              }
+              b.ResetTimer()
+              blockBytes := uint64(0)
+              for i := 0; i < b.N; i++ {
+                expectedCount := numNodes
+                if haveDeadKeys {
+                  expectedCount /= 2
+                }
+                blockBytes += scanLiveness(b, eng, expectedCount)
+              }
+              b.ReportMetric(float64(blockBytes)/float64(b.N), "block-bytes/op")
+            })
+          }
+        })
+      }
+    })
+  }
+}