From aa43bb473975dfa4e0e4728359ab49eab96c36cf Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 21 Mar 2022 23:48:58 -0400 Subject: [PATCH] kv: scan empty right-hand side of split for stats See conversation in https://cockroachlabs.slack.com/archives/C0KB9Q03D/p1647551964065369. Bulk ingestion operations like IMPORT and index backfills have a fast-path for in-order ingestion where they periodically manually split and scatter the empty head of the keyspace being ingested into. In tests, we've seen that this split-and-scatter step can be expensive. This appears to be due in part to the stats recomputation we perform during range splits. Currently, this stats computation always scans the left hand side of the split. This is unfortunate for bulk-issued manual splits, because those manual splits are intentionally performed on the right border of the range, meaning that their left hand side contains the entire ~500MB range and their right hand side is empty. This commit extends the range split logic by adding a heuristic that chooses to scan the right side of the split first computing stats in cases where the right side is entirely empty. The "scan first" part is subtle, because there are cases where a split needs to scan both sides when computing stats. Specifically, it needs to do so in cases where the range has estimates in its MVCCStats. For an explanation, see `split_stats_helper.go`. It's not clear to me whether this commit is sufficient to help bulk ingestion or whether we'll need to do something about these stats estimates as well. --- .../kvserver/batcheval/cmd_end_transaction.go | 91 ++++++++++++++----- .../kvserver/batcheval/split_stats_helper.go | 83 ++++++++++++----- 2 files changed, 128 insertions(+), 46 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go index bc52ee111f09..d008fcb5d113 100644 --- a/pkg/kv/kvserver/batcheval/cmd_end_transaction.go +++ b/pkg/kv/kvserver/batcheval/cmd_end_transaction.go @@ -747,17 +747,22 @@ func RunCommitTrigger( return result.Result{}, nil } -// splitTrigger is called on a successful commit of a transaction -// containing an AdminSplit operation. It copies the AbortSpan for -// the new range and recomputes stats for both the existing, left hand -// side (LHS) range and the right hand side (RHS) range. For -// performance it only computes the stats for the original range (the -// left hand side) and infers the RHS stats by subtracting from the -// original stats. We compute the LHS stats because the split key -// computation ensures that we do not create large LHS -// ranges. However, this optimization is only possible if the stats -// are fully accurate. If they contain estimates, stats for both the -// LHS and RHS are computed. +// splitTrigger is called on a successful commit of a transaction containing an +// AdminSplit operation. It copies the AbortSpan for the new range and +// recomputes stats for both the existing, left hand side (LHS) range and the +// right hand side (RHS) range. For performance it only computes the stats for +// one side of the range and infers the stats for the other side by subtracting +// from the original stats. The choice of which side to scan is controlled by a +// heuristic. This choice defaults to scanning the LHS stats and inferring the +// RHS because the split key computation performed by the splitQueue ensures +// that we do not create large LHS ranges. However, if the RHS's global keyspace +// is entirely empty, it is scanned first instead. An example where we expect +// this heuristic to choose the RHS is bulk ingestion, which often splits off +// empty ranges and benefits from scanning the empty RHS when computing stats. +// Regardless of the choice of which side to scan first, the optimization to +// infer the other side's stats is only possible if the stats are fully accurate +// (ContainsEstimates = 0). If they contain estimates, stats for both the LHS +// and RHS are computed. // // Splits are complicated. A split is initiated when a replica receives an // AdminSplit request. Note that this request (and other "admin" requests) @@ -915,29 +920,63 @@ func splitTrigger( split.RightDesc.StartKey, split.RightDesc.EndKey, desc) } - // Compute the absolute stats for the (post-split) LHS. No more - // modifications to it are allowed after this line. - - leftMS, err := rditer.ComputeStatsForRange(&split.LeftDesc, batch, ts.WallTime) + // Determine which side to scan first when computing the post-split stats. We + // scan the left-hand side first unless the right side's global keyspace is + // entirely empty. In cases where the range's stats do not already contain + // estimates, only one side needs to be scanned. + // TODO(nvanbenschoten): this is a simple heuristic. If we had a cheap way to + // determine the relative sizes of the LHS and RHS, we could be more + // sophisticated here and always choose to scan the cheaper side. + scanRightFirst, err := isGlobalKeyspaceEmpty(batch, &split.RightDesc) if err != nil { - return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to compute stats for LHS range after split") + return enginepb.MVCCStats{}, result.Result{}, errors.Wrapf(err, + "unable to determine whether right hand side of split is empty") } - log.Event(ctx, "computed stats for left hand side range") h := splitStatsHelperInput{ AbsPreSplitBothEstimated: rec.GetMVCCStats(), DeltaBatchEstimated: bothDeltaMS, - AbsPostSplitLeft: leftMS, - AbsPostSplitRightFn: func() (enginepb.MVCCStats, error) { - rightMS, err := rditer.ComputeStatsForRange( - &split.RightDesc, batch, ts.WallTime, - ) - return rightMS, errors.Wrap(err, "unable to compute stats for RHS range after split") - }, + AbsPostSplitLeftFn: makeScanStatsFn(ctx, batch, ts, &split.LeftDesc, "left hand side"), + AbsPostSplitRightFn: makeScanStatsFn(ctx, batch, ts, &split.RightDesc, "right hand side"), + ScanRightFirst: scanRightFirst, } return splitTriggerHelper(ctx, rec, batch, h, split, ts) } +// isGlobalKeyspaceEmpty returns whether the global keyspace of the provided +// range is entirely empty or whether it contains at least one key. +func isGlobalKeyspaceEmpty(reader storage.Reader, d *roachpb.RangeDescriptor) (bool, error) { + span := d.KeySpan().AsRawSpanWithNoLocals() + iter := reader.NewMVCCIterator(storage.MVCCKeyIterKind, storage.IterOptions{UpperBound: span.EndKey}) + defer iter.Close() + iter.SeekGE(storage.MakeMVCCMetadataKey(span.Key)) + ok, err := iter.Valid() + if err != nil { + return false, err + } + return !ok /* empty */, nil +} + +// makeScanStatsFn constructs a splitStatsScanFn for the provided post-split +// range descriptor which computes the range's statistics. +func makeScanStatsFn( + ctx context.Context, + reader storage.Reader, + ts hlc.Timestamp, + sideDesc *roachpb.RangeDescriptor, + sideName string, +) splitStatsScanFn { + return func() (enginepb.MVCCStats, error) { + sideMS, err := rditer.ComputeStatsForRange(sideDesc, reader, ts.WallTime) + if err != nil { + return enginepb.MVCCStats{}, errors.Wrapf(err, + "unable to compute stats for %s range after split", sideName) + } + log.Eventf(ctx, "computed stats for %s range", sideName) + return sideMS, nil + } +} + // splitTriggerHelper continues the work begun by splitTrigger, but has a // reduced scope that has all stats-related concerns bundled into a // splitStatsHelper. @@ -967,6 +1006,10 @@ func splitTriggerHelper( return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to copy last replica GC timestamp") } + // Compute the absolute stats for the (post-split) ranges. No more + // modifications to the left hand side are allowed after this line and any + // modifications to the right hand side are accounted for by updating the + // helper's AbsPostSplitRight() reference. h, err := makeSplitStatsHelper(statsInput) if err != nil { return enginepb.MVCCStats{}, result.Result{}, err diff --git a/pkg/kv/kvserver/batcheval/split_stats_helper.go b/pkg/kv/kvserver/batcheval/split_stats_helper.go index 6e9d76874dc5..d238d0704428 100644 --- a/pkg/kv/kvserver/batcheval/split_stats_helper.go +++ b/pkg/kv/kvserver/batcheval/split_stats_helper.go @@ -25,11 +25,12 @@ import "github.com/cockroachdb/cockroach/pkg/storage/enginepb" // but nothing in this code relies on that). Since we have no reason to // introduce ContainsEstimates in a split trigger, this typically has // ContainsEstimates unset, but the results will be estimate free either way. -// - AbsPostSplitLeft: the stats of the range after applying the split, i.e. -// accounting both for the shrinking as well as for the writes in DeltaBatch -// related to the shrunk keyrange. -// In practice, we obtain this by recomputing the stats, and so we don't -// expect ContainsEstimates to be set in them. +// - AbsPostSplit{Left,Right}: the stats of either the left or right hand side +// range after applying the split, i.e. accounting both for the shrinking as +// well as for the writes in DeltaBatch related to the shrunk keyrange. In +// practice, we obtain this by recomputing the stats using the corresponding +// AbsPostSplit{Left,Right}Fn, and so we don't expect ContainsEstimates to be +// set in them. The choice of which side to scan is controlled by ScanRightFirst. // // We are interested in computing from this the quantities // @@ -103,48 +104,86 @@ import "github.com/cockroachdb/cockroach/pkg/storage/enginepb" type splitStatsHelper struct { in splitStatsHelperInput + absPostSplitLeft *enginepb.MVCCStats absPostSplitRight *enginepb.MVCCStats } +// splitStatsScanFn scans a post-split keyspace to compute its stats. The +// computed stats should not contain estimates. +type splitStatsScanFn func() (enginepb.MVCCStats, error) + // splitStatsHelperInput is passed to makeSplitStatsHelper. type splitStatsHelperInput struct { AbsPreSplitBothEstimated enginepb.MVCCStats DeltaBatchEstimated enginepb.MVCCStats - AbsPostSplitLeft enginepb.MVCCStats + // AbsPostSplitLeftFn returns the stats for the left hand side of the + // split. + AbsPostSplitLeftFn splitStatsScanFn // AbsPostSplitRightFn returns the stats for the right hand side of the - // split. This is only called (and only once) when either of the first two - // fields above contains estimates, so that we can guarantee that the - // post-splits stats don't. - AbsPostSplitRightFn func() (enginepb.MVCCStats, error) + // split. + AbsPostSplitRightFn splitStatsScanFn + // ScanRightFirst controls whether the left hand side or the right hand + // side of the split is scanned first. In cases where neither of the + // input stats contain estimates, this is the only side that needs to + // be scanned. + ScanRightFirst bool } // makeSplitStatsHelper initializes a splitStatsHelper. The values in the input // are assumed to not change outside of the helper and must no longer be used. -// The provided AbsPostSplitRightFn recomputes the right hand side of the split -// after accounting for the split trigger batch. This is only invoked at most -// once, and only when necessary. +// The provided AbsPostSplitLeftFn and AbsPostSplitRightFn recompute the left +// and right hand sides of the split after accounting for the split trigger +// batch. Each are only invoked at most once, and only when necessary. func makeSplitStatsHelper(input splitStatsHelperInput) (splitStatsHelper, error) { h := splitStatsHelper{ in: input, } + // Scan to compute the stats for the first side. + var absPostSplitFirst enginepb.MVCCStats + var err error + if h.in.ScanRightFirst { + absPostSplitFirst, err = input.AbsPostSplitRightFn() + h.absPostSplitRight = &absPostSplitFirst + } else { + absPostSplitFirst, err = input.AbsPostSplitLeftFn() + h.absPostSplitLeft = &absPostSplitFirst + } + if err != nil { + return splitStatsHelper{}, err + } + if h.in.AbsPreSplitBothEstimated.ContainsEstimates == 0 && h.in.DeltaBatchEstimated.ContainsEstimates == 0 { - // We have CombinedErrorDelta zero, so use arithmetic to compute - // AbsPostSplitRight(). + // We have CombinedErrorDelta zero, so use arithmetic to compute the + // stats for the second side. ms := h.in.AbsPreSplitBothEstimated - ms.Subtract(h.in.AbsPostSplitLeft) + ms.Subtract(absPostSplitFirst) ms.Add(h.in.DeltaBatchEstimated) - h.absPostSplitRight = &ms + if h.in.ScanRightFirst { + h.absPostSplitLeft = &ms + } else { + h.absPostSplitRight = &ms + } return h, nil } - // Estimates are contained in the input, so ask the oracle for - // AbsPostSplitRight(). - ms, err := input.AbsPostSplitRightFn() + + // Estimates are contained in the input, so ask the oracle to scan to compute + // the stats for the second side. We only scan the second side when either of + // the input stats above (AbsPreSplitBothEstimated or DeltaBatchEstimated) + // contains estimates, so that we can guarantee that the post-splits stats + // don't. + var absPostSplitSecond enginepb.MVCCStats + if h.in.ScanRightFirst { + absPostSplitSecond, err = input.AbsPostSplitLeftFn() + h.absPostSplitLeft = &absPostSplitSecond + } else { + absPostSplitSecond, err = input.AbsPostSplitRightFn() + h.absPostSplitRight = &absPostSplitSecond + } if err != nil { return splitStatsHelper{}, err } - h.absPostSplitRight = &ms return h, nil } @@ -167,7 +206,7 @@ func (h splitStatsHelper) DeltaPostSplitLeft() enginepb.MVCCStats { // NB: if we ever wanted to also write to the left hand side after init'ing // the helper, we can make that work, too. // NB: note how none of this depends on mutations to absPostSplitRight. - ms := h.in.AbsPostSplitLeft + ms := *h.absPostSplitLeft ms.Subtract(h.in.AbsPreSplitBothEstimated) return ms