From a760335a5f0e170f965860de55c4dce960101020 Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Fri, 27 Oct 2023 16:56:02 +0000 Subject: [PATCH 1/2] kv,server,roachpb: avoid error overhead for x-locality comparison Cross locality traffic instrumentation was added to raft, snapshots and batch requests to quantify the amount of cross region/zone traffic. Errors would be returned from `CompareWithLocality` when the region, or zone locality flags were set in an unsupported manner according to our documentation. These error allocations added overhead (cpu/mem) when hit. Alter `CompareWithLocality` to return booleans in place of an error to reduce overhead. Resolves: #111148 Resolves: #111142 Informs: #111561 Release note: None --- pkg/kv/kvclient/kvcoord/dist_sender.go | 10 ++-- pkg/kv/kvserver/store_snapshot.go | 10 ++-- pkg/roachpb/metadata.go | 14 +++-- pkg/roachpb/metadata_test.go | 79 +++++++++++--------------- pkg/server/node.go | 10 ++-- 5 files changed, 57 insertions(+), 66 deletions(-) diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go index 392b917482f6..701e82d716e2 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender.go @@ -2656,12 +2656,12 @@ func (ds *DistSender) getLocalityComparison( return roachpb.LocalityComparisonType_UNDEFINED } - comparisonResult, regionErr, zoneErr := gatewayNodeDesc.Locality.CompareWithLocality(destinationNodeDesc.Locality) - if regionErr != nil { - log.VInfof(ctx, 5, "unable to determine if the given nodes are cross region %v", regionErr) + comparisonResult, regionValid, zoneValid := gatewayNodeDesc.Locality.CompareWithLocality(destinationNodeDesc.Locality) + if !regionValid { + log.VInfof(ctx, 5, "unable to determine if the given nodes are cross region") } - if zoneErr != nil { - log.VInfof(ctx, 5, "unable to determine if the given nodes are cross zone %v", zoneErr) + if !zoneValid { + log.VInfof(ctx, 5, "unable to determine if the given nodes are cross zone") } return comparisonResult } diff --git a/pkg/kv/kvserver/store_snapshot.go b/pkg/kv/kvserver/store_snapshot.go index 900557990f8a..fa9bfc25135b 100644 --- a/pkg/kv/kvserver/store_snapshot.go +++ b/pkg/kv/kvserver/store_snapshot.go @@ -1272,12 +1272,12 @@ func (s *Store) getLocalityComparison( ) roachpb.LocalityComparisonType { firstLocality := s.cfg.StorePool.GetNodeLocality(fromNodeID) secLocality := s.cfg.StorePool.GetNodeLocality(toNodeID) - comparisonResult, regionErr, zoneErr := firstLocality.CompareWithLocality(secLocality) - if regionErr != nil { - log.VEventf(ctx, 5, "unable to determine if the given nodes are cross region %v", regionErr) + comparisonResult, regionValid, zoneValid := firstLocality.CompareWithLocality(secLocality) + if !regionValid { + log.VEventf(ctx, 5, "unable to determine if the given nodes are cross region") } - if zoneErr != nil { - log.VEventf(ctx, 5, "unable to determine if the given nodes are cross zone %v", zoneErr) + if !zoneValid { + log.VEventf(ctx, 5, "unable to determine if the given nodes are cross zone") } return comparisonResult } diff --git a/pkg/roachpb/metadata.go b/pkg/roachpb/metadata.go index bc64d70f52f9..21d4c26b5ef1 100644 --- a/pkg/roachpb/metadata.go +++ b/pkg/roachpb/metadata.go @@ -698,7 +698,7 @@ func (l Locality) getFirstRegionFirstZone() ( // iteration. func (l Locality) CompareWithLocality( other Locality, -) (_ LocalityComparisonType, regionErr error, zoneErr error) { +) (_ LocalityComparisonType, regionValid bool, zoneValid bool) { firstRegionValue, hasRegion, firstZoneKey, firstZone, hasZone := l.getFirstRegionFirstZone() firstRegionValueOther, hasRegionOther, firstZoneKeyOther, firstZoneOther, hasZoneOther := other.getFirstRegionFirstZone() @@ -707,21 +707,23 @@ func (l Locality) CompareWithLocality( if !hasRegion || !hasRegionOther { isCrossRegion = false - regionErr = errors.Errorf("localities must have a valid region tier key for cross-region comparison") + } else { + regionValid = true } if (!hasZone || !hasZoneOther) || (firstZoneKey != firstZoneKeyOther) { isCrossZone = false - zoneErr = errors.Errorf("localities must have a valid zone tier key for cross-zone comparison") + } else { + zoneValid = true } if isCrossRegion { - return LocalityComparisonType_CROSS_REGION, regionErr, zoneErr + return LocalityComparisonType_CROSS_REGION, regionValid, zoneValid } else { if isCrossZone { - return LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionErr, zoneErr + return LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid, zoneValid } else { - return LocalityComparisonType_SAME_REGION_SAME_ZONE, regionErr, zoneErr + return LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid, zoneValid } } } diff --git a/pkg/roachpb/metadata_test.go b/pkg/roachpb/metadata_test.go index 672bb3f6d719..4d76b8628eda 100644 --- a/pkg/roachpb/metadata_test.go +++ b/pkg/roachpb/metadata_test.go @@ -224,9 +224,6 @@ func TestLocalityMatches(t *testing.T) { } func TestLocalityCompareWithLocality(t *testing.T) { - regionErrStr := "localities must have a valid region tier key for cross-region comparison" - zoneErrStr := "localities must have a valid zone tier key for cross-zone comparison" - firstRegionStr := "us-east" secRegionStr := "us-west" firstZoneStr := "us-east-1" @@ -251,83 +248,83 @@ func TestLocalityCompareWithLocality(t *testing.T) { } for _, tc := range []struct { - l string - other string - localityType LocalityComparisonType - crossRegionErr string - crossZoneErr string + l string + other string + localityType LocalityComparisonType + regionValid bool + zoneValid bool }{ // -------- Part 1: check for different zone tier alternatives -------- // Valid tier keys, same regions and same zones. {l: "region=us-west,zone=us-west-1", other: "region=us-west,zone=us-west-1", - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: true, zoneValid: true}, // Valid tier keys, different regions and different zones. {l: "region=us-west,zone=us-west-1", other: "region=us-east,zone=us-west-2", - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: true}, // Valid tier keys, different regions and different zones. {l: "region=us-west,availability-zone=us-west-1", other: "region=us-east,availability-zone=us-east-1", - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: true}, // Valid tier keys, same regions and different zones. {l: "region=us-west,az=us-west-1", other: "region=us-west,other-keys=us,az=us-east-1", - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: true, zoneValid: true}, // Invalid zone tier key and different regions. {l: "region=us-west,availability-zone=us-west-1", other: "region=us-east,zone=us-east-1", - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: false}, // Valid zone tier key (edge case), different zones and regions. {l: "region=us-west,zone=us-west-1", other: "region=us-east,zone=us-west-2,az=us-west-1", - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: true}, // Missing zone tier key and different regions. {l: "region=us-west,zone=us-west-1", other: "region=us-east", - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: false}, // Different region and different zones with non-unique & invalid zone tier key. {l: "region=us-west,zone=us-west-1,az=us-west-2", other: "az=us-west-1,region=us-west,zone=us-west-1", - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: "", crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: true, zoneValid: false}, // Different regions and different zones with non-unique & valid zone tier key. {l: "region=us-west,az=us-west-2,zone=us-west-1", other: "region=us-west,az=us-west-1", - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: true, zoneValid: true}, // Invalid region tier key and different zones. {l: "country=us,zone=us-west-1", other: "country=us,zone=us-west-2", - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: regionErrStr, crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: false, zoneValid: true}, // Missing region tier key and different zones. {l: "az=us-west-1", other: "region=us-east,az=us-west-2", - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: regionErrStr, crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: false, zoneValid: true}, // Invalid region and zone tier key. {l: "invalid-key=us-west,zone=us-west-1", other: "region=us-east,invalid-key=us-west-1", - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // Invalid region and zone tier key. {l: "country=us,dc=us-west-2", other: "country=us,dc=us-west-2", - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // -------- Part 2: single region, single zone -------- // One: (both) Two: (region) {l: makeLocalityStr(firstRegionStr, firstZoneStr), other: makeLocalityStr(secRegionStr, ""), - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: false}, // One: (both) Two: (zone) {l: makeLocalityStr(firstRegionStr, firstZoneStr), other: makeLocalityStr("", secZoneStr), - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: regionErrStr, crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: false, zoneValid: true}, // One: (region) Two: (region) {l: makeLocalityStr(firstRegionStr, ""), other: makeLocalityStr(secRegionStr, ""), - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: false}, // One: (zone) Two: (zone) {l: makeLocalityStr("", firstZoneStr), other: makeLocalityStr("", secZoneStr), - localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, crossRegionErr: regionErrStr, crossZoneErr: ""}, + localityType: LocalityComparisonType_SAME_REGION_CROSS_ZONE, regionValid: false, zoneValid: true}, // One: (region) Two: (zone) {l: makeLocalityStr(firstRegionStr, ""), other: makeLocalityStr("", secZoneStr), - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // One: (both) Two: (both) {l: makeLocalityStr(firstRegionStr, firstZoneStr), other: makeLocalityStr(secRegionStr, secZoneStr), - localityType: LocalityComparisonType_CROSS_REGION, crossRegionErr: "", crossZoneErr: ""}, + localityType: LocalityComparisonType_CROSS_REGION, regionValid: true, zoneValid: true}, // One: (none) Two: (none) {l: makeLocalityStr("", ""), other: makeLocalityStr("", ""), - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // One: (region) Two: (none) {l: makeLocalityStr(firstRegionStr, ""), other: makeLocalityStr("", ""), - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // One: (zone) Two: (none) {l: makeLocalityStr("", firstZoneStr), other: makeLocalityStr("", ""), - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, // One: (both) Two: (none) {l: makeLocalityStr(firstRegionStr, firstZoneStr), other: makeLocalityStr("", ""), - localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, crossRegionErr: regionErrStr, crossZoneErr: zoneErrStr}, + localityType: LocalityComparisonType_SAME_REGION_SAME_ZONE, regionValid: false, zoneValid: false}, } { t.Run(fmt.Sprintf("%s-crosslocality-%s", tc.l, tc.other), func(t *testing.T) { var l Locality @@ -335,21 +332,13 @@ func TestLocalityCompareWithLocality(t *testing.T) { require.NoError(t, l.Set(tc.l)) require.NoError(t, other.Set(tc.other)) type localities struct { - localityType LocalityComparisonType - crossRegionErr string - crossZoneErr string - } - localityType, regionErr, zoneErr := l.CompareWithLocality(other) - regionErrStr := "" - if regionErr != nil { - regionErrStr = regionErr.Error() - } - zoneErrStr := "" - if zoneErr != nil { - zoneErrStr = zoneErr.Error() + localityType LocalityComparisonType + regionValid bool + zoneValid bool } - actual := localities{localityType, regionErrStr, zoneErrStr} - expected := localities{tc.localityType, tc.crossRegionErr, tc.crossZoneErr} + localityType, regionValid, zoneValid := l.CompareWithLocality(other) + actual := localities{localityType, regionValid, zoneValid} + expected := localities{tc.localityType, tc.regionValid, tc.zoneValid} require.Equal(t, expected, actual) }) } diff --git a/pkg/server/node.go b/pkg/server/node.go index 63ef8392bfb6..6a1bda87f474 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -1401,12 +1401,12 @@ func (n *Node) getLocalityComparison( return roachpb.LocalityComparisonType_UNDEFINED } - comparisonResult, regionErr, zoneErr := n.Descriptor.Locality.CompareWithLocality(gatewayNodeDesc.Locality) - if regionErr != nil { - log.VInfof(ctx, 5, "unable to determine if the given nodes are cross region %v", regionErr) + comparisonResult, regionValid, zoneValid := n.Descriptor.Locality.CompareWithLocality(gatewayNodeDesc.Locality) + if !regionValid { + log.VInfof(ctx, 5, "unable to determine if the given nodes are cross region") } - if zoneErr != nil { - log.VInfof(ctx, 5, "unable to determine if the given nodes are cross zone %v", zoneErr) + if !zoneValid { + log.VInfof(ctx, 5, "unable to determine if the given nodes are cross zone") } return comparisonResult From 4f9e30da3f155efbc53a59efd77545249eb7b8f4 Mon Sep 17 00:00:00 2001 From: sumeerbhola Date: Tue, 24 Oct 2023 20:24:15 -0400 Subject: [PATCH 2/2] kvserver: add BenchmarkNodeLivenessScanStorage to measure liveness scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Node liveness scans, like the one done in MaybeGossipNodeLivenessRaftMuLocked, while holding raftMu, are performance sensitive, and slowness has caused production issues (https://github.com/cockroachlabs/support/issues/2665, https://github.com/cockroachlabs/support/issues/2107). This benchmark measures the scan performance both when DELs (due to GC) have not been compacted away, and when they have. It also sets up a varying number of live versions since decommissioned nodes will have a single live version. Results on M1 macbook on master with dead-keys=false and compacted=true: NodeLivenessScanStorage/num-live=2/compacted=true-10 26.80µ ± 9% NodeLivenessScanStorage/num-live=5/compacted=true-10 30.34µ ± 3% NodeLivenessScanStorage/num-live=10/compacted=true-10 38.88µ ± 8% NodeLivenessScanStorage/num-live=1000/compacted=true-10 861.5µ ± 3% When compacted=false the scan takes ~10ms, which is > 100x slower, but probably acceptable for this workload. NodeLivenessScanStorage/num-live=2/compacted=false-10 9.430m ± 5% NodeLivenessScanStorage/num-live=5/compacted=false-10 9.534m ± 4% NodeLivenessScanStorage/num-live=10/compacted=false-10 9.456m ± 2% NodeLivenessScanStorage/num-live=1000/compacted=false-10 10.34m ± 7% dead-keys=true (and compacted=false) defeats the NextPrefix optimization, since the next prefix can have all its keys deleted and the iterator has to step through all of them (it can't be sure that all the keys for that next prefix are deleted). This case should not occur in the liveness range, as we don't remove decommissioned entries, but is included for better understanding. NodeLivenessScanStorage/num-live=2/dead-keys=true/compacted=false-10 58.33m Compared to v22.2, the results are sometimes > 10x faster, when the pebbleMVCCScanner seek optimization in v22.2 was defeated. │ sec/op │ sec/op vs base │ NodeLivenessScanStorage/num-live=2/compacted=false-10 117.280m ± 2% 9.430m ± 5% -91.96% (p=0.002 n=6) NodeLivenessScanStorage/num-live=5/compacted=false-10 117.298m ± 0% 9.534m ± 4% -91.87% (p=0.002 n=6) NodeLivenessScanStorage/num-live=10/compacted=false-10 12.009m ± 0% 9.456m ± 2% -21.26% (p=0.002 n=6) NodeLivenessScanStorage/num-live=1000/compacted=false-10 13.04m ± 0% 10.34m ± 7% -20.66% (p=0.002 n=6) │ block-bytes/op │ block-bytes/op vs base │ NodeLivenessScanStorage/num-live=2/compacted=false-10 14.565Mi ± 0% 8.356Mi ± 0% -42.63% (p=0.002 n=6) NodeLivenessScanStorage/num-live=5/compacted=false-10 14.570Mi ± 0% 8.361Mi ± 0% -42.61% (p=0.002 n=6) NodeLivenessScanStorage/num-live=10/compacted=false-10 11.094Mi ± 0% 8.368Mi ± 0% -24.57% (p=0.002 n=6) NodeLivenessScanStorage/num-live=1000/compacted=false-10 12.235Mi ± 0% 8.990Mi ± 0% -26.53% (p=0.002 n=6) │ B/op │ B/op vs base │ NodeLivenessScanStorage/num-live=2/compacted=false-10 42.83Ki ± 4% 41.87Ki ± 0% -2.22% (p=0.002 n=6) NodeLivenessScanStorage/num-live=5/compacted=false-10 43.28Ki ± 3% 41.84Ki ± 0% -3.32% (p=0.002 n=6) NodeLivenessScanStorage/num-live=10/compacted=false-10 37.59Ki ± 0% 41.92Ki ± 0% +11.51% (p=0.002 n=6) NodeLivenessScanStorage/num-live=1000/compacted=false-10 37.67Ki ± 1% 42.66Ki ± 0% +13.23% (p=0.002 n=6) │ allocs/op │ allocs/op vs base │ NodeLivenessScanStorage/num-live=2/compacted=false-10 105.00 ± 8% 85.00 ± 0% -19.05% (p=0.002 n=6) NodeLivenessScanStorage/num-live=5/compacted=false-10 107.00 ± 5% 85.00 ± 0% -20.56% (p=0.002 n=6) NodeLivenessScanStorage/num-live=10/compacted=false-10 74.00 ± 1% 85.00 ± 0% +14.86% (p=0.002 n=6) NodeLivenessScanStorage/num-live=1000/compacted=false-10 79.00 ± 1% 92.00 ± 1% +16.46% (p=0.002 n=6) Relates to https://github.com/cockroachlabs/support/issues/2665 Epic: none Release note: None --- pkg/kv/kvserver/node_liveness_test.go | 147 ++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/pkg/kv/kvserver/node_liveness_test.go b/pkg/kv/kvserver/node_liveness_test.go index 38135491119a..cd358b707a15 100644 --- a/pkg/kv/kvserver/node_liveness_test.go +++ b/pkg/kv/kvserver/node_liveness_test.go @@ -13,6 +13,7 @@ package kvserver_test import ( "bytes" "context" + "fmt" "reflect" "sort" "strconv" @@ -33,9 +34,11 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/listenerutil" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" @@ -1301,3 +1304,147 @@ func TestNodeLivenessDecommissionAbsent(t *testing.T) { // Recommission from third node. setMembershipStatus(nl2, livenesspb.MembershipStatus_ACTIVE, true) } + +func BenchmarkNodeLivenessScanStorage(b *testing.B) { + skip.UnderShort(b) + defer log.Scope(b).Close(b) + + ctx := context.Background() + const numNodes = 100 + setupEng := func(b *testing.B, numLiveVersions int, haveFullyDeadKeys bool) storage.Engine { + eng := storage.NewDefaultInMemForTesting(storage.DisableAutomaticCompactions) + // 20 per minute, so 1000 represents 50 min of liveness writes in a level. + // This is unusual, but we can have such accumulation if flushes and + // compactions are rare. + const numVersionsPerLevel = 1000 + // All versions in level l will be deleted in level l+1. The versions + // written at the highest level are not deleted and the number of these + // per node is controlled by numLiveVersions. Additionally, if + // haveFullyDeadKeys is true, the highest level only has live versions for + // alternating nodes. + // + // NB: haveFullyDeadKeys=true is not representative of NodeLiveness, since + // we don't remove decommissioned entries. It is included here just to + // understand the effect on the NextPrefix optimization. + const numLevels = 7 + tsFunc := func(l int, v int) int64 { + return int64(l*numVersionsPerLevel + v + 10) + } + for l := 0; l < numLevels; l++ { + for v := 0; v < numVersionsPerLevel; v++ { + ts := tsFunc(l, v) + for n := roachpb.NodeID(0); n < numNodes; n++ { + lKey := keys.NodeLivenessKey(n) + // Always write a version if not at the highest level. If at the + // highest level, only write if v < numLiveVersions. Additionally, + // either haveFullyDeadKeys must be false or this node is one that + // has live versions. + if l < numLevels-1 || (v < numLiveVersions && (!haveFullyDeadKeys || n%2 == 0)) { + liveness := livenesspb.Liveness{ + NodeID: n, + Epoch: 100, + Expiration: hlc.LegacyTimestamp{WallTime: ts}, + Draining: false, + Membership: livenesspb.MembershipStatus_ACTIVE, + } + + require.NoError(b, storage.MVCCPutProto( + ctx, eng, lKey, hlc.Timestamp{WallTime: ts}, &liveness, + storage.MVCCWriteOptions{})) + } + // Else most recent level and the other conditions for writing a + // version are not satisfied. + + if l != 0 { + // Clear the key from the next older level. + require.NoError(b, eng.ClearMVCC(storage.MVCCKey{ + Key: lKey, + Timestamp: hlc.Timestamp{WallTime: tsFunc(l-1, v)}, + }, storage.ClearOptions{})) + } + } + if l == 0 && v < 10 { + // Flush to grow the memtable size. + require.NoError(b, eng.Flush()) + } + } + if l == 0 { + // Since did many flushes, compact everything down. + require.NoError(b, eng.Compact()) + } else { + // Flush the next level. This will become a L0 sub-level. + require.NoError(b, eng.Flush()) + } + } + return eng + } + scanLiveness := func(b *testing.B, eng storage.Engine, expectedCount int) (blockBytes uint64) { + ss := &kvpb.ScanStats{} + opts := storage.MVCCScanOptions{ + ScanStats: ss, + } + scanRes, err := storage.MVCCScan( + ctx, eng.NewReadOnly(storage.StandardDurability), keys.NodeLivenessPrefix, + keys.NodeLivenessKeyMax, hlc.MaxTimestamp, opts) + if err != nil { + b.Fatal(err.Error()) + } + if expectedCount != len(scanRes.KVs) { + b.Fatalf("expected %d != actual %d", expectedCount, len(scanRes.KVs)) + } + return ss.BlockBytes + } + + // We expect active nodes to have 100s of live versions since liveness is + // written every 3s, and GC is configured to happen after 10min. But GC can + // be delayed, and decommissioned nodes will only have 1 version, so we + // explore those extremes. + // + // Results on M1 macbook with dead-keys=false and compacted=true: + // NodeLivenessScanStorage/num-live=2/compacted=true-10 26.80µ ± 9% + // NodeLivenessScanStorage/num-live=5/compacted=true-10 30.34µ ± 3% + // NodeLivenessScanStorage/num-live=10/compacted=true-10 38.88µ ± 8% + // NodeLivenessScanStorage/num-live=1000/compacted=true-10 861.5µ ± 3% + // + // When compacted=false the scan takes ~10ms, which is > 100x slower, but + // probably acceptable for this workload. + // NodeLivenessScanStorage/num-live=2/compacted=false-10 9.430m ± 5% + // NodeLivenessScanStorage/num-live=5/compacted=false-10 9.534m ± 4% + // NodeLivenessScanStorage/num-live=10/compacted=false-10 9.456m ± 2% + // NodeLivenessScanStorage/num-live=1000/compacted=false-10 10.34m ± 7% + // + // dead-keys=true (and compacted=false) defeats the NextPrefix optimization, + // since the next prefix can have all its keys deleted and the iterator has + // to step through all of them (it can't be sure that all the keys for that + // next prefix are deleted). This case should not occur in the liveness + // range, as discussed earlier. + // + // NodeLivenessScanStorage/num-live=2/dead-keys=true/compacted=false-10 58.33m + for _, numLiveVersions := range []int{2, 5, 10, 1000} { + b.Run(fmt.Sprintf("num-live=%d", numLiveVersions), func(b *testing.B) { + for _, haveDeadKeys := range []bool{false, true} { + b.Run(fmt.Sprintf("dead-keys=%t", haveDeadKeys), func(b *testing.B) { + for _, compacted := range []bool{false, true} { + b.Run(fmt.Sprintf("compacted=%t", compacted), func(b *testing.B) { + eng := setupEng(b, numLiveVersions, haveDeadKeys) + defer eng.Close() + if compacted { + require.NoError(b, eng.Compact()) + } + b.ResetTimer() + blockBytes := uint64(0) + for i := 0; i < b.N; i++ { + expectedCount := numNodes + if haveDeadKeys { + expectedCount /= 2 + } + blockBytes += scanLiveness(b, eng, expectedCount) + } + b.ReportMetric(float64(blockBytes)/float64(b.N), "block-bytes/op") + }) + } + }) + } + }) + } +}