From 2d855d377626836fd6f0048cbd88483c84db9850 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 20 Mar 2023 11:44:13 +0000 Subject: [PATCH] kvserver: reduce `SysBytes` MVCC stats race during merges During a range merge, we subsume the RHS and ship its MVCC stats via the merge trigger to add them to the LHS stats. Since the RHS range ID-local keys aren't present in the merged range, the merge trigger computed these and subtracted them from the given stats. However, this could race with a lease request, which ignores latches and writes to the range ID-local keyspace, resulting in incorrect `SysBytes` MVCC stats. This patch instead computes the range ID-local MVCC stats during subsume and sends them via a new `RangeIDLocalMVCCStats` field. This still doesn't guarantee that they're consistent with the RHS's in-memory stats, since the latch-ingnoring lease request can update these independently of the subsume request's engine snapshot. However, it substantially reduces the likelihood of this race. While it would be possible to prevent this race entirely by introducing additional synchronization between lease requests and merge application, this would likely come with significant additional complexity, which doesn't seem worth it just to avoid `SysBytes` being a few bytes wrong. The main fallout is a log message when the consistency checker detects the stats mismatch, and potential test flake. This PR therefore settles for best-effort prevention. Epic: none Release note: None --- pkg/kv/kvnemesis/env.go | 17 ----------------- pkg/kv/kvpb/api.proto | 8 ++++++++ .../kvserver/batcheval/cmd_end_transaction.go | 18 ++++++++++++++---- pkg/kv/kvserver/batcheval/cmd_subsume.go | 15 +++++++++++++++ pkg/kv/kvserver/replica_command.go | 13 +++++++------ pkg/roachpb/data.proto | 5 +++++ 6 files changed, 49 insertions(+), 27 deletions(-) diff --git a/pkg/kv/kvnemesis/env.go b/pkg/kv/kvnemesis/env.go index 1f6112df1b2c..9aa39d77f935 100644 --- a/pkg/kv/kvnemesis/env.go +++ b/pkg/kv/kvnemesis/env.go @@ -14,7 +14,6 @@ import ( "context" gosql "database/sql" "fmt" - "regexp" "time" "github.com/cockroachdb/cockroach-go/v2/crdb" @@ -69,22 +68,6 @@ func (e *Env) CheckConsistency(ctx context.Context, span roachpb.Span) []error { if err := rows.Scan(&rangeID, &key, &status, &detail); err != nil { return []error{err} } - // TODO(erikgrinaker): There's a known issue that can result in a 10-byte - // discrepancy in SysBytes. This hasn't been investigated, but it's not - // critical so we ignore it for now. See: - // https://github.com/cockroachdb/cockroach/issues/93896 - if status == kvpb.CheckConsistencyResponse_RANGE_CONSISTENT_STATS_INCORRECT.String() { - m := regexp.MustCompile(`.*\ndelta \(stats-computed\): \{(.*)\}`).FindStringSubmatch(detail) - if len(m) > 1 { - delta := m[1] - // Strip out LastUpdateNanos and all zero-valued fields. - delta = regexp.MustCompile(`LastUpdateNanos:\d+`).ReplaceAllString(delta, "") - delta = regexp.MustCompile(`\S+:0\b`).ReplaceAllString(delta, "") - if regexp.MustCompile(`^\s*SysBytes:10\s*$`).MatchString(delta) { - continue - } - } - } switch status { case kvpb.CheckConsistencyResponse_RANGE_INDETERMINATE.String(): // Can't do anything, so let it slide. diff --git a/pkg/kv/kvpb/api.proto b/pkg/kv/kvpb/api.proto index 303b60458029..b78a4a4cab1e 100644 --- a/pkg/kv/kvpb/api.proto +++ b/pkg/kv/kvpb/api.proto @@ -2079,6 +2079,14 @@ message SubsumeResponse { (gogoproto.customname) = "MVCCStats" ]; + // RangeIDLocalMVCCStats are the MVCC statistics for the replicated range + // ID-local keys. During a merge, these must be subtracted from MVCCStats + // since they won't be present in the merged range. + storage.enginepb.MVCCStats range_id_local_mvcc_stats = 8 [ + (gogoproto.nullable) = false, + (gogoproto.customname) = "RangeIDLocalMVCCStats" + ]; + // LeaseAppliedIndex is the lease applied index of the last applied command // at the time that the Subsume request executed. This is NOT intended to be // the lease index of the SubsumeRequest itself. Instead, it is intended to diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go index 09fbecaaf7ec..4cfb66529222 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go @@ -16,6 +16,7 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" @@ -1272,9 +1273,7 @@ func mergeTrigger( // The stats for the merged range are the sum of the LHS and RHS stats // adjusted for range key merges (which is the inverse of the split - // adjustment). The RHS's replicated range ID stats are subtracted -- the only - // replicated range ID keys we copy from the RHS are the keys in the abort - // span, and we've already accounted for those stats above. + // adjustment). ms.Add(merge.RightMVCCStats) msRangeKeyDelta, err := computeSplitRangeKeyStatsDelta(batch, merge.LeftDesc, merge.RightDesc) if err != nil { @@ -1282,7 +1281,18 @@ func mergeTrigger( } ms.Subtract(msRangeKeyDelta) - { + // The RHS's replicated range ID stats are subtracted -- the only replicated + // range ID keys we copy from the RHS are the keys in the abort span, and + // we've already accounted for those stats above. + // + // NB: RangeIDLocalMVCCStats is introduced in 23.2 to mitigate a SysBytes race + // with lease requests (which ignore latches). For 23.1 compatibility, we fall + // back to computing it here when not set. We don't need a version gate since + // it's only used at evaluation time and doesn't affect below-Raft state. + if merge.RightRangeIDLocalMVCCStats != (enginepb.MVCCStats{}) { + ms.Subtract(merge.RightRangeIDLocalMVCCStats) + } else { + _ = clusterversion.V23_1 // remove this branch when 23.1 support is removed ridPrefix := keys.MakeRangeIDReplicatedPrefix(merge.RightDesc.RangeID) sysMS, err := storage.ComputeStats(batch, ridPrefix, ridPrefix.PrefixEnd(), 0 /* nowNanos */) if err != nil { diff --git a/pkg/kv/kvserver/batcheval/cmd_subsume.go b/pkg/kv/kvserver/batcheval/cmd_subsume.go index 05dcc8ddb02e..fbcc45106811 100644 --- a/pkg/kv/kvserver/batcheval/cmd_subsume.go +++ b/pkg/kv/kvserver/batcheval/cmd_subsume.go @@ -135,6 +135,21 @@ func Subsume( reply.LeaseAppliedIndex = cArgs.EvalCtx.GetLeaseAppliedIndex() reply.FreezeStart = cArgs.EvalCtx.Clock().NowAsClockTimestamp() + // We ship the range ID-local replicated stats as well, since these must be + // subtracted from MVCCStats for the merged range. + // + // NB: lease requests can race with this computation, since they ignore + // latches and write to the range ID-local keyspace. This can very rarely + // result in a minor SysBytes discrepancy when the GetMVCCStats() call above + // is not consistent with this readWriter snapshot. We accept this for now, + // rather than introducing additional synchronization complexity. + ridPrefix := keys.MakeRangeIDReplicatedPrefix(desc.RangeID) + reply.RangeIDLocalMVCCStats, err = storage.ComputeStats( + readWriter, ridPrefix, ridPrefix.PrefixEnd(), 0 /* nowNanos */) + if err != nil { + return result.Result{}, err + } + // Collect a read summary from the RHS leaseholder to ship to the LHS // leaseholder. This is used to instruct the LHS on how to update its // timestamp cache to ensure that no future writes are allowed to invalidate diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index db370553da9e..ab422f0d4017 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -764,12 +764,13 @@ func (r *Replica) AdminMerge( Commit: true, InternalCommitTrigger: &roachpb.InternalCommitTrigger{ MergeTrigger: &roachpb.MergeTrigger{ - LeftDesc: updatedLeftDesc, - RightDesc: rightDesc, - RightMVCCStats: rhsSnapshotRes.MVCCStats, - FreezeStart: rhsSnapshotRes.FreezeStart, - RightClosedTimestamp: rhsSnapshotRes.ClosedTimestamp, - RightReadSummary: rhsSnapshotRes.ReadSummary, + LeftDesc: updatedLeftDesc, + RightDesc: rightDesc, + RightMVCCStats: rhsSnapshotRes.MVCCStats, + RightRangeIDLocalMVCCStats: rhsSnapshotRes.RangeIDLocalMVCCStats, + FreezeStart: rhsSnapshotRes.FreezeStart, + RightClosedTimestamp: rhsSnapshotRes.ClosedTimestamp, + RightReadSummary: rhsSnapshotRes.ReadSummary, }, }, }) diff --git a/pkg/roachpb/data.proto b/pkg/roachpb/data.proto index 1d1b0ee1fd9a..179059e6fd86 100644 --- a/pkg/roachpb/data.proto +++ b/pkg/roachpb/data.proto @@ -153,6 +153,11 @@ message MergeTrigger { (gogoproto.nullable) = false ]; + storage.enginepb.MVCCStats right_range_id_local_mvcc_stats = 9 [ + (gogoproto.customname) = "RightRangeIDLocalMVCCStats", + (gogoproto.nullable) = false + ]; + // FreezeStart is a timestamp that is guaranteed to be greater than the // timestamps at which any requests were serviced by the right-hand side range // before it stopped responding to requests altogether (in anticipation of