From c21c6a577450535d521fb583e7466b7365ed80d2 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Thu, 29 Sep 2022 11:40:32 -0400 Subject: [PATCH 1/6] storage: Use range key masking in CheckSSTConflicts This change uses range key masking in case of import cancellation/restart inside CheckSSTConflicts. This prevents point keys shadowed by range tombstones from appearing to CheckSSTConflicts, significantly speeding up its runtime and allowing import restarts to succeed without getting wedged. Unblocks #87309. Release note: None. --- pkg/kv/kvserver/batcheval/cmd_add_sstable.go | 2 +- pkg/storage/bench_test.go | 2 +- pkg/storage/sst.go | 12 +++++++----- pkg/storage/sst_test.go | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go index e083ee8da748..f1b8eb090e90 100644 --- a/pkg/kv/kvserver/batcheval/cmd_add_sstable.go +++ b/pkg/kv/kvserver/batcheval/cmd_add_sstable.go @@ -219,7 +219,7 @@ func EvalAddSSTable( leftPeekBound, rightPeekBound := rangeTombstonePeekBounds( args.Key, args.EndKey, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey()) statsDelta, err = storage.CheckSSTConflicts(ctx, sst, readWriter, start, end, leftPeekBound, rightPeekBound, - args.DisallowShadowing, args.DisallowShadowingBelow, maxIntents, usePrefixSeek) + args.DisallowShadowing, args.DisallowShadowingBelow, sstToReqTS, maxIntents, usePrefixSeek) statsDelta.Add(sstReqStatsDelta) if err != nil { return result.Result{}, errors.Wrap(err, "checking for key collisions") diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go index a4aec0270ba7..253d220c44b8 100644 --- a/pkg/storage/bench_test.go +++ b/pkg/storage/bench_test.go @@ -1985,7 +1985,7 @@ func runCheckSSTConflicts( b.ResetTimer() for i := 0; i < b.N; i++ { - _, err := CheckSSTConflicts(context.Background(), sstFile.Data(), eng, sstStart, sstEnd, sstStart.Key, sstEnd.Key.Next(), false, hlc.Timestamp{}, math.MaxInt64, usePrefixSeek) + _, err := CheckSSTConflicts(context.Background(), sstFile.Data(), eng, sstStart, sstEnd, sstStart.Key, sstEnd.Key.Next(), false, hlc.Timestamp{}, hlc.Timestamp{}, math.MaxInt64, usePrefixSeek) require.NoError(b, err) } } diff --git a/pkg/storage/sst.go b/pkg/storage/sst.go index ca8997c4ee88..13412dd6b6e7 100644 --- a/pkg/storage/sst.go +++ b/pkg/storage/sst.go @@ -52,6 +52,7 @@ func CheckSSTConflicts( leftPeekBound, rightPeekBound roachpb.Key, disallowShadowing bool, disallowShadowingBelow hlc.Timestamp, + sstToReqTimestamp hlc.Timestamp, maxIntents int64, usePrefixSeek bool, ) (enginepb.MVCCStats, error) { @@ -130,11 +131,12 @@ func CheckSSTConflicts( } extIter := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ - KeyTypes: IterKeyTypePointsAndRanges, - LowerBound: leftPeekBound, - UpperBound: rightPeekBound, - Prefix: usePrefixSeek, - useL6Filters: true, + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: leftPeekBound, + UpperBound: rightPeekBound, + RangeKeyMaskingBelow: sstToReqTimestamp, + Prefix: usePrefixSeek, + useL6Filters: true, }) defer extIter.Close() diff --git a/pkg/storage/sst_test.go b/pkg/storage/sst_test.go index 8e2b50a44f91..175e6547c71a 100644 --- a/pkg/storage/sst_test.go +++ b/pkg/storage/sst_test.go @@ -87,7 +87,7 @@ func TestCheckSSTConflictsMaxIntents(t *testing.T) { // Provoke and check WriteIntentErrors. startKey, endKey := MVCCKey{Key: roachpb.Key(start)}, MVCCKey{Key: roachpb.Key(end)} _, err := CheckSSTConflicts(ctx, sstFile.Bytes(), engine, startKey, endKey, startKey.Key, endKey.Key.Next(), - false /*disallowShadowing*/, hlc.Timestamp{} /*disallowShadowingBelow*/, tc.maxIntents, usePrefixSeek) + false /*disallowShadowing*/, hlc.Timestamp{} /*disallowShadowingBelow*/, hlc.Timestamp{} /* sstReqTS */, tc.maxIntents, usePrefixSeek) require.Error(t, err) writeIntentErr := &roachpb.WriteIntentError{} require.ErrorAs(t, err, &writeIntentErr) From 8fd4264f2d2cacd204f08ca1a4533907206e38d9 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Wed, 28 Sep 2022 15:27:27 -0400 Subject: [PATCH 2/6] storage: Move no-overlap fast path up in CheckSSTConflicts This change moves up the fast path in CheckSSTConflicts where we check for any engine keys within the overlapping SST keys. If no engine keys are found, we can return from the function immediately. Previously we'd open a range-key sst iterator before doing this check, which has a pretty significant construction cost. Fixes #88702. Release note: None --- pkg/storage/sst.go | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/pkg/storage/sst.go b/pkg/storage/sst.go index 13412dd6b6e7..f49a26d2955f 100644 --- a/pkg/storage/sst.go +++ b/pkg/storage/sst.go @@ -72,6 +72,25 @@ func CheckSSTConflicts( rightPeekBound = keys.MaxKey } + var statsDiff enginepb.MVCCStats + var intents []roachpb.Intent + if usePrefixSeek { + // If we're going to be using a prefix iterator, check for the fast path + // first, where there are no keys in the reader between the sstable's start + // and end keys. We use a non-prefix iterator for this search, and reopen a + // prefix one if there are engine keys in the span. + nonPrefixIter := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + UpperBound: end.Key, + }) + nonPrefixIter.SeekGE(start) + valid, err := nonPrefixIter.Valid() + nonPrefixIter.Close() + if !valid { + return statsDiff, err + } + } + // Check for any range keys. // // TODO(bilal): Expose reader.Properties.NumRangeKeys() here, so we don't @@ -113,23 +132,6 @@ func CheckSSTConflicts( } rkIter.Close() - var statsDiff enginepb.MVCCStats - var intents []roachpb.Intent - - if usePrefixSeek { - // If we're going to be using a prefix iterator, check for the fast path - // first, where there are no keys in the reader between the sstable's start - // and end keys. We use a non-prefix iterator for this search, and reopen a - // prefix one if there are engine keys in the span. - nonPrefixIter := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{UpperBound: end.Key}) - nonPrefixIter.SeekGE(start) - valid, err := nonPrefixIter.Valid() - nonPrefixIter.Close() - if !valid { - return statsDiff, err - } - } - extIter := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ KeyTypes: IterKeyTypePointsAndRanges, LowerBound: leftPeekBound, From 25676bba2de56de89cc68de098b19c17e73f6cc5 Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Fri, 30 Sep 2022 10:46:27 -0400 Subject: [PATCH 3/6] reduce: remove and simplify index PARTITION BY clauses Release note: None --- pkg/cmd/reduce/reduce/reducesql/reducesql.go | 99 ++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/pkg/cmd/reduce/reduce/reducesql/reducesql.go b/pkg/cmd/reduce/reduce/reducesql/reducesql.go index 5708e9dd64ba..ef9fa32802de 100644 --- a/pkg/cmd/reduce/reduce/reducesql/reducesql.go +++ b/pkg/cmd/reduce/reduce/reducesql/reducesql.go @@ -49,6 +49,10 @@ var SQLPasses = []reduce.Pass{ removeIndexCols, removeIndexPredicate, removeIndexStoringCols, + removeIndexPartitionBy, + removeIndexPartitions, + removeIndexPartitionListValues, + removeIndexPartitionListCols, removeWindowPartitions, removeDBSchema, removeFroms, @@ -828,6 +832,101 @@ var ( } return 0 }) + removeIndexPartitionBy = walkSQL("remove INDEX PARTITION BY", func(xfi int, node interface{}) int { + xf := xfi == 0 + removePartitionBy := func(idx *tree.IndexTableDef) int { + if idx.PartitionByIndex != nil { + if xf { + idx.PartitionByIndex = nil + } + return 1 + } + return 0 + } + switch node := node.(type) { + case *tree.IndexTableDef: + return removePartitionBy(node) + case *tree.UniqueConstraintTableDef: + return removePartitionBy(&node.IndexTableDef) + } + return 0 + }) + removeIndexPartitions = walkSQL("remove INDEX partitions", func(xfi int, node interface{}) int { + removePartitionBy := func(idx *tree.IndexTableDef) int { + if idx.PartitionByIndex != nil { + n := len(idx.PartitionByIndex.List) + if xfi < n { + idx.PartitionByIndex.List = + append(idx.PartitionByIndex.List[:xfi], idx.PartitionByIndex.List[xfi+1:]...) + } + return n + } + return 0 + } + switch node := node.(type) { + case *tree.IndexTableDef: + return removePartitionBy(node) + case *tree.UniqueConstraintTableDef: + return removePartitionBy(&node.IndexTableDef) + } + return 0 + }) + removeIndexPartitionListValues = walkSQL("remove INDEX partition list values", func(xfi int, node interface{}) int { + removePartitionBy := func(idx *tree.IndexTableDef) int { + if idx.PartitionByIndex != nil { + n := 0 + for i := range idx.PartitionByIndex.List { + list := &idx.PartitionByIndex.List[i] + l := len(list.Exprs) + if xfi >= n && xfi < n+l { + list.Exprs = append(list.Exprs[:xfi-n], list.Exprs[xfi-n+1:]...) + } + n += l + } + return n + } + return 0 + } + switch node := node.(type) { + case *tree.IndexTableDef: + return removePartitionBy(node) + case *tree.UniqueConstraintTableDef: + return removePartitionBy(&node.IndexTableDef) + } + return 0 + }) + removeIndexPartitionListCols = walkSQL("remove INDEX partition list cols", func(xfi int, node interface{}) int { + removePartitionBy := func(idx *tree.IndexTableDef) int { + if idx.PartitionByIndex != nil && len(idx.PartitionByIndex.List) > 0 { + n := len(idx.PartitionByIndex.Fields) + if xfi < n { + idx.PartitionByIndex.Fields = + append(idx.PartitionByIndex.Fields[:xfi], idx.PartitionByIndex.Fields[xfi+1:]...) + // Remove the corresponding column from the index columns. + idx.Columns = append(idx.Columns[:xfi], idx.Columns[xfi+1:]...) + // Remove the corresponding value from every tuple in every + // partition. + for i := range idx.PartitionByIndex.List { + list := &idx.PartitionByIndex.List[i] + for j := range list.Exprs { + t := list.Exprs[j].(*tree.Tuple) + t.Exprs = append(t.Exprs[:xfi], t.Exprs[xfi+1:]...) + } + } + + } + return n + } + return 0 + } + switch node := node.(type) { + case *tree.IndexTableDef: + return removePartitionBy(node) + case *tree.UniqueConstraintTableDef: + return removePartitionBy(&node.IndexTableDef) + } + return 0 + }) removeWindowPartitions = walkSQL("remove WINDOW partitions", func(xfi int, node interface{}) int { switch node := node.(type) { case *tree.WindowDef: From a9010ad8a35db1a257a0f8820f3a8ade5bab0e3a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 3 Oct 2022 10:56:16 +0000 Subject: [PATCH 4/6] storage: improve `BenchmarkUpdateSSTTimestamps` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch improves `BenchmarkUpdateSSTTimestamps`, by running separate benchmarks varying the number of keys and concurrency. In particular, this exercises the `sstable.RewriteKeySuffixes` fast path, while the old benchmark only used the naïve read/write slow path. Release note: None --- pkg/storage/sst_test.go | 96 ++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/pkg/storage/sst_test.go b/pkg/storage/sst_test.go index 8e2b50a44f91..280a57dec569 100644 --- a/pkg/storage/sst_test.go +++ b/pkg/storage/sst_test.go @@ -15,12 +15,12 @@ import ( "encoding/binary" "fmt" "math/rand" - "os" - "runtime/pprof" "testing" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -104,74 +104,62 @@ func TestCheckSSTConflictsMaxIntents(t *testing.T) { } func BenchmarkUpdateSSTTimestamps(b *testing.B) { - const ( - modeZero = iota + 1 // all zeroes - modeCounter // uint64 counter in first 8 bytes - modeRandom // random values - - concurrency = 0 // 0 uses naïve replacement - sstSize = 0 - keyCount = 500000 - valueSize = 8 - valueMode = modeRandom - profile = false // cpuprofile.pprof - ) - - if sstSize > 0 && keyCount > 0 { - b.Fatal("Can't set both sstSize and keyCount") + defer log.Scope(b).Close(b) + skip.UnderShort(b) + + ctx := context.Background() + + for _, numKeys := range []int{1, 10, 100, 1000, 10000, 100000} { + b.Run(fmt.Sprintf("numKeys=%d", numKeys), func(b *testing.B) { + for _, concurrency := range []int{0, 1, 2, 4, 8} { // 0 uses naïve read/write loop + b.Run(fmt.Sprintf("concurrency=%d", concurrency), func(b *testing.B) { + runUpdateSSTTimestamps(ctx, b, numKeys, concurrency) + }) + } + }) } +} - b.StopTimer() +func runUpdateSSTTimestamps(ctx context.Context, b *testing.B, numKeys int, concurrency int) { + const valueSize = 8 r := rand.New(rand.NewSource(7)) - - ctx := context.Background() st := cluster.MakeTestingClusterSettings() sstFile := &MemFile{} writer := MakeIngestionSSTWriter(ctx, st, sstFile) defer writer.Close() + sstTimestamp := hlc.MinTimestamp + reqTimestamp := hlc.Timestamp{WallTime: 1634899098417970999, Logical: 9} + key := make([]byte, 8) value := make([]byte, valueSize) - sstTimestamp := hlc.Timestamp{WallTime: 1} - var i uint64 - for i = 0; (keyCount > 0 && i < keyCount) || (sstSize > 0 && sstFile.Len() < sstSize); i++ { - binary.BigEndian.PutUint64(key, i) - - switch valueMode { - case modeZero: - case modeCounter: - binary.BigEndian.PutUint64(value, i) - case modeRandom: - r.Read(value) - default: - b.Fatalf("unknown value mode %d", valueMode) - } - - var v MVCCValue - v.Value.SetBytes(value) - v.Value.InitChecksum(key) + for i := 0; i < numKeys; i++ { + binary.BigEndian.PutUint64(key, uint64(i)) + r.Read(value) - require.NoError(b, writer.PutMVCC(MVCCKey{Key: key, Timestamp: sstTimestamp}, v)) - } - writer.Close() - b.Logf("%vMB %v keys", sstFile.Len()/1e6, i) - - if profile { - f, err := os.Create("cpuprofile.pprof") - require.NoError(b, err) - defer f.Close() + var mvccValue MVCCValue + mvccValue.Value.SetBytes(value) + mvccValue.Value.InitChecksum(key) - require.NoError(b, pprof.StartCPUProfile(f)) - defer pprof.StopCPUProfile() + if err := writer.PutMVCC(MVCCKey{Key: key, Timestamp: sstTimestamp}, mvccValue); err != nil { + require.NoError(b, err) // for performance + } } + require.NoError(b, writer.Finish()) - requestTimestamp := hlc.Timestamp{WallTime: 1634899098417970999, Logical: 9} + b.SetBytes(int64(numKeys * (len(key) + len(value)))) + b.ResetTimer() - b.StartTimer() + var res []byte for i := 0; i < b.N; i++ { - _, _, err := UpdateSSTTimestamps( - ctx, st, sstFile.Bytes(), sstTimestamp, requestTimestamp, concurrency, nil /* stats */) - require.NoError(b, err) + var ms enginepb.MVCCStats + var err error + res, _, err = UpdateSSTTimestamps( + ctx, st, sstFile.Bytes(), sstTimestamp, reqTimestamp, concurrency, &ms) + if err != nil { + require.NoError(b, err) // for performance + } } + _ = res } From d6ccd106865e86c765112353562cc07f3afe2ead Mon Sep 17 00:00:00 2001 From: Marcus Gartner Date: Fri, 30 Sep 2022 15:19:22 -0400 Subject: [PATCH 5/6] opt: fix invalid transformation in SplitLimitedSelectIntoUnionSelects This commit removes some untested logic in `SplitLimitedSelectIntoUnionSelects` that created invalid expression transformations. With this logic, this rule could construct an unordered limit below the `UnionAll` which is incorrect. The bug could cause incorrect query results. Fixes #88993 Release note (bug fix): A bug has been fixed that could cause incorrect results in rare cases. The bug could only present if the following conditions were true: 1. A query with `ORDER BY` and `LIMIT` was executed. 2. The table containing the `ORDER BY` columns had an index containing those columns. 3. The index in (2) contained a prefix of columns held to a fixed number of values by the query filter (e.g., `WHERE a IN (1, 3)`), a `CHECK` constraint (e.g., `CHECK (a IN (1, 3))`), inferred by a computed column expression (e.g. `WHERE a IN (1, 3)` and a column `b INT AS (a + 10) STORED`), or inferred by a `PARTITION BY` clause (e.g. `INDEX (a, ...) PARTITION BY LIST (a) (PARTITION p VALUES ((1), (3)))`). This bug was present since version 22.1.0. --- .../logic_test/partitioning_constrained_scans | 1 + .../testdata/logic_test/partitioning_index | 18 ++ .../logictest/testdata/logic_test/aggregate | 16 ++ pkg/sql/opt/xform/general_funcs.go | 62 +++-- pkg/sql/opt/xform/testdata/rules/limit | 223 ++++++++---------- 5 files changed, 156 insertions(+), 164 deletions(-) diff --git a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_constrained_scans b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_constrained_scans index d586035403ae..4840f4cff051 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_constrained_scans +++ b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_constrained_scans @@ -317,3 +317,4 @@ select │ └── [/'foo'/e'bar\x00'/5 - ] └── filters └── c = 5 + diff --git a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_index b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_index index e7e6ed83cc7a..173916fd17f3 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/partitioning_index +++ b/pkg/ccl/logictestccl/testdata/logic_test/partitioning_index @@ -262,3 +262,21 @@ NULL 10 1 20 NULL 1 20 20 1 25 NULL 1 + +# Regression test for #88993 where a limit pushed down into a union of scans +# caused incorrect query results. +statement ok +CREATE TABLE t88993 ( + a INT, + b INT, + c INT, + INDEX (b, c, a) PARTITION BY LIST (b, c) ( + PARTITION p1 VALUES IN ((11, 50)) + ) +); +INSERT INTO t88993 (a, b, c) VALUES (1, 10, 150), (0, 11, 100); + +query I +SELECT min(a) FROM t88993 +---- +0 diff --git a/pkg/sql/logictest/testdata/logic_test/aggregate b/pkg/sql/logictest/testdata/logic_test/aggregate index a326a07c2877..efb90856c9ff 100644 --- a/pkg/sql/logictest/testdata/logic_test/aggregate +++ b/pkg/sql/logictest/testdata/logic_test/aggregate @@ -3786,3 +3786,19 @@ query FFFF select covar_pop(y, x), covar_samp(y, x), regr_sxx(y, x), regr_syy(y, x) from corrupt_combine ---- 37.5 45 2983.333333333333 17.5 + +# Regression test for #88993 where a limit pushed down into a union of scans +# caused incorrect query results. +statement ok +CREATE TABLE t2 ( + a INT, + b INT, + c INT, + INDEX (b, c, a) +); +INSERT INTO t2 (a, b, c) VALUES (1, 10, 20), (0, 11, 100); + +query I +SELECT min(a) FROM t2 WHERE (b <= 11 AND c < 50) OR (b = 11 AND c = 50) OR (b >= 11 AND c > 50) +---- +0 diff --git a/pkg/sql/opt/xform/general_funcs.go b/pkg/sql/opt/xform/general_funcs.go index 6ce63a8fd8ad..bc33618ecd97 100644 --- a/pkg/sql/opt/xform/general_funcs.go +++ b/pkg/sql/opt/xform/general_funcs.go @@ -545,6 +545,7 @@ func (c *CustomFuncs) splitScanIntoUnionScansOrSelects( // UnionAll tree. var noLimitSpans constraint.Spans var last memo.RelExpr + spColList := sp.Cols.ToList() queue := list.New() for i, n := 0, spans.Count(); i < n; i++ { if i >= budgetExceededIndex { @@ -561,23 +562,35 @@ func (c *CustomFuncs) splitScanIntoUnionScansOrSelects( } for j, m := 0, singleKeySpans.Count(); j < m; j++ { // Construct a new Scan for each span. + // Note: newHardLimit will be 0 (i.e., no limit) if there are + // filters to be applied in a Select. newScanPrivate := c.makeNewScanPrivate( sp, cons.Columns, newHardLimit, singleKeySpans.Get(j), ) - newScanOrSelect := c.e.f.ConstructScan(newScanPrivate) + newScanOrLimitedSelect := c.e.f.ConstructScan(newScanPrivate) if !filters.IsTrue() { - newScanOrSelect = c.wrapScanInLimitedSelect( - newScanOrSelect, - sp, - newScanPrivate, - filters, - limit, + // If there are filters, apply them and a limit. The limit is + // not needed if there are no filters because the scan's hard + // limit will be set (see newHardLimit). + // + // TODO(mgartner/msirek): Converting ColSets to ColLists here is + // only safe because column IDs are always allocated in a + // consistent, ascending order for each duplicated table in the + // metadata. If column ID allocation changes, this could break. + newColList := newScanPrivate.Cols.ToList() + newScanOrLimitedSelect = c.e.f.ConstructLimit( + c.e.f.ConstructSelect( + newScanOrLimitedSelect, + c.RemapScanColsInFilter(filters, sp, newScanPrivate), + ), + c.IntConst(tree.NewDInt(tree.DInt(limit))), + ordering.RemapColumns(spColList, newColList), ) } - queue.PushBack(newScanOrSelect) + queue.PushBack(newScanOrLimitedSelect) } } @@ -652,15 +665,21 @@ func (c *CustomFuncs) splitScanIntoUnionScansOrSelects( Columns: cons.Columns.RemapColumns(sp.Table, newScanPrivate.Table), Spans: noLimitSpans, }) + // TODO(mgartner): We should be able to add a LIMIT above the Scan or Select + // below, as long as we remap the original ordering columns. This could + // allow a top-k to be planned instead of a sort. newScanOrSelect := c.e.f.ConstructScan(newScanPrivate) if !filters.IsTrue() { - newScanOrSelect = c.wrapScanInLimitedSelect(newScanOrSelect, sp, newScanPrivate, filters, limit) + newScanOrSelect = c.e.f.ConstructSelect( + newScanOrSelect, + c.RemapScanColsInFilter(filters, sp, newScanPrivate), + ) } // TODO(mgartner/msirek): Converting ColSets to ColLists here is only safe // because column IDs are always allocated in a consistent, ascending order // for each duplicated table in the metadata. If column ID allocation // changes, this could break. - return makeNewUnion(last, newScanOrSelect, sp.Cols.ToList()), true + return makeNewUnion(last, newScanOrSelect, spColList), true } // numAllowedValues returns the number of allowed values for a column with a @@ -711,29 +730,6 @@ func (c *CustomFuncs) numAllowedValues( return 0, false } -// wrapScanInLimitedSelect wraps "scan" in a SelectExpr with filters mapped from -// the originalScanPrivate columns to the columns in scan. If limit is non-zero, -// the SelectExpr is wrapped in a LimitExpr with that limit. -func (c *CustomFuncs) wrapScanInLimitedSelect( - scan memo.RelExpr, - originalScanPrivate, newScanPrivate *memo.ScanPrivate, - filters memo.FiltersExpr, - limit int, -) (limitedSelect memo.RelExpr) { - limitedSelect = c.e.f.ConstructSelect( - scan, - c.RemapScanColsInFilter(filters, originalScanPrivate, newScanPrivate), - ) - if limit != 0 { - limitedSelect = c.e.f.ConstructLimit( - limitedSelect, - c.IntConst(tree.NewDInt(tree.DInt(limit))), - c.EmptyOrdering(), - ) - } - return limitedSelect -} - // indexHasOrderingSequence returns whether the Scan can provide a given // ordering under the assumption that we are scanning a single-key span with the // given keyLength (and if so, whether we need to scan it in reverse). diff --git a/pkg/sql/opt/xform/testdata/rules/limit b/pkg/sql/opt/xform/testdata/rules/limit index 691b00c6642d..ac4ebd938043 100644 --- a/pkg/sql/opt/xform/testdata/rules/limit +++ b/pkg/sql/opt/xform/testdata/rules/limit @@ -2380,9 +2380,9 @@ top-k │ └── [/10/4 - /10/4] └── key: (1-4) -# Repro for #82730 +# Regression test for #82730. exec-ddl -CREATE TABLE table1 ( +CREATE TABLE t82730a ( col1_0 NAME, col1_1 INT8, col1_3 INT8, @@ -2413,13 +2413,13 @@ CREATE TABLE table1 ( ---- opt expect-not=SplitLimitedSelectIntoUnionSelects -UPDATE table1 +UPDATE t82730a SET col1_5 = col1_5 WHERE col1_0 ILIKE col1_0 ORDER BY col1_3 LIMIT 84 ---- -update table1 +update t82730a ├── columns: ├── fetch columns: col1_0:7 col1_1:8 col1_3:9 col1_5:10 ├── update-mapping: @@ -2437,7 +2437,7 @@ update table1 ├── columns: col1_0:7!null col1_1:8 col1_3:9 col1_5:10 ├── key: (7) ├── fd: (7)-->(8-10), (8,9)~~>(7,10) - ├── scan table1 + ├── scan t82730a │ ├── columns: col1_0:7!null col1_1:8 col1_3:9 col1_5:10 │ ├── key: (7) │ └── fd: (7)-->(8-10), (8,9)~~>(7,10) @@ -2445,7 +2445,7 @@ update table1 └── col1_0:7 ILIKE col1_0:7 [outer=(7), constraints=(/7: (/NULL - ])] exec-ddl -CREATE TABLE table2 ( +CREATE TABLE t82730b ( col1_0 NAME, col1_1 INT8, col1_3 INT8, @@ -2476,144 +2476,105 @@ CREATE TABLE table2 ( ---- opt expect=SplitLimitedSelectIntoUnionSelects -UPDATE table2 AS tab_41831 +UPDATE t82730b AS tab_41831 SET col1_5 = col1_5 WHERE col1_0 ILIKE col1_0 ORDER BY col1_3 LIMIT 84 ---- -update table2 [as=tab_41831] +update t82730b [as=tab_41831] ├── columns: ├── fetch columns: col1_0:7 col1_1:8 col1_3:9 col1_5:10 ├── update-mapping: │ └── col1_5:10 => col1_5:4 ├── cardinality: [0 - 0] ├── volatile, mutations - └── index-join table2 + └── top-k ├── columns: col1_0:7!null col1_1:8 col1_3:9 col1_5:10 + ├── internal-ordering: +9 + ├── k: 84 ├── cardinality: [0 - 84] ├── key: (7) ├── fd: (7)-->(8-10), (8,9)~~>(7,10) - └── limit - ├── columns: col1_0:7!null col1_1:8 col1_3:9 - ├── internal-ordering: +9 - ├── cardinality: [0 - 84] + └── select + ├── columns: col1_0:7!null col1_1:8 col1_3:9 col1_5:10 ├── key: (7) - ├── fd: (7)-->(8,9), (8,9)~~>(7) - ├── union-all - │ ├── columns: col1_0:7!null col1_1:8 col1_3:9 - │ ├── left columns: col1_0:37 col1_1:38 col1_3:39 - │ ├── right columns: col1_0:43 col1_1:44 col1_3:45 - │ ├── cardinality: [0 - 336] - │ ├── ordering: +9 - │ ├── limit hint: 84.00 - │ ├── union-all - │ │ ├── columns: col1_0:37!null col1_1:38!null col1_3:39 - │ │ ├── left columns: col1_0:31 col1_1:32 col1_3:33 - │ │ ├── right columns: col1_0:25 col1_1:26 col1_3:27 - │ │ ├── cardinality: [0 - 252] - │ │ ├── ordering: +39 - │ │ ├── limit hint: 84.00 - │ │ ├── union-all - │ │ │ ├── columns: col1_0:31!null col1_1:32!null col1_3:33 - │ │ │ ├── left columns: col1_0:13 col1_1:14 col1_3:15 - │ │ │ ├── right columns: col1_0:19 col1_1:20 col1_3:21 - │ │ │ ├── cardinality: [0 - 168] - │ │ │ ├── ordering: +33 - │ │ │ ├── limit hint: 84.00 - │ │ │ ├── limit - │ │ │ │ ├── columns: col1_0:13!null col1_1:14!null col1_3:15 - │ │ │ │ ├── cardinality: [0 - 84] - │ │ │ │ ├── key: (13) - │ │ │ │ ├── fd: ()-->(14), (13)-->(15), (14,15)~~>(13) - │ │ │ │ ├── ordering: +15 opt(14) [actual: +15] - │ │ │ │ ├── limit hint: 84.00 - │ │ │ │ ├── select - │ │ │ │ │ ├── columns: col1_0:13!null col1_1:14!null col1_3:15 - │ │ │ │ │ ├── key: (13) - │ │ │ │ │ ├── fd: ()-->(14), (13)-->(15), (14,15)~~>(13) - │ │ │ │ │ ├── ordering: +15 opt(14) [actual: +15] - │ │ │ │ │ ├── limit hint: 84.00 - │ │ │ │ │ ├── scan table2@table2_col1_1_col1_3_key [as=tab_41831] - │ │ │ │ │ │ ├── columns: col1_0:13!null col1_1:14!null col1_3:15 - │ │ │ │ │ │ ├── constraint: /14/15: [/2 - /2] - │ │ │ │ │ │ ├── key: (13) - │ │ │ │ │ │ ├── fd: ()-->(14), (13)-->(15), (14,15)~~>(13) - │ │ │ │ │ │ └── ordering: +15 opt(14) [actual: +15] - │ │ │ │ │ └── filters - │ │ │ │ │ └── col1_0:13 ILIKE col1_0:13 [outer=(13), constraints=(/13: (/NULL - ])] - │ │ │ │ └── 84 - │ │ │ └── limit - │ │ │ ├── columns: col1_0:19!null col1_1:20!null col1_3:21 - │ │ │ ├── cardinality: [0 - 84] - │ │ │ ├── key: (19) - │ │ │ ├── fd: ()-->(20), (19)-->(21), (20,21)~~>(19) - │ │ │ ├── ordering: +21 opt(20) [actual: +21] - │ │ │ ├── limit hint: 84.00 - │ │ │ ├── select - │ │ │ │ ├── columns: col1_0:19!null col1_1:20!null col1_3:21 - │ │ │ │ ├── key: (19) - │ │ │ │ ├── fd: ()-->(20), (19)-->(21), (20,21)~~>(19) - │ │ │ │ ├── ordering: +21 opt(20) [actual: +21] - │ │ │ │ ├── limit hint: 84.00 - │ │ │ │ ├── scan table2@table2_col1_1_col1_3_key [as=tab_41831] - │ │ │ │ │ ├── columns: col1_0:19!null col1_1:20!null col1_3:21 - │ │ │ │ │ ├── constraint: /20/21: [/3 - /3] - │ │ │ │ │ ├── key: (19) - │ │ │ │ │ ├── fd: ()-->(20), (19)-->(21), (20,21)~~>(19) - │ │ │ │ │ └── ordering: +21 opt(20) [actual: +21] - │ │ │ │ └── filters - │ │ │ │ └── col1_0:19 ILIKE col1_0:19 [outer=(19), constraints=(/19: (/NULL - ])] - │ │ │ └── 84 - │ │ └── limit - │ │ ├── columns: col1_0:25!null col1_1:26!null col1_3:27 - │ │ ├── cardinality: [0 - 84] - │ │ ├── key: (25) - │ │ ├── fd: ()-->(26), (25)-->(27), (26,27)~~>(25) - │ │ ├── ordering: +27 opt(26) [actual: +27] - │ │ ├── limit hint: 84.00 - │ │ ├── select - │ │ │ ├── columns: col1_0:25!null col1_1:26!null col1_3:27 - │ │ │ ├── key: (25) - │ │ │ ├── fd: ()-->(26), (25)-->(27), (26,27)~~>(25) - │ │ │ ├── ordering: +27 opt(26) [actual: +27] - │ │ │ ├── limit hint: 84.00 - │ │ │ ├── scan table2@table2_col1_1_col1_3_key [as=tab_41831] - │ │ │ │ ├── columns: col1_0:25!null col1_1:26!null col1_3:27 - │ │ │ │ ├── constraint: /26/27: [/4 - /4] - │ │ │ │ ├── key: (25) - │ │ │ │ ├── fd: ()-->(26), (25)-->(27), (26,27)~~>(25) - │ │ │ │ └── ordering: +27 opt(26) [actual: +27] - │ │ │ └── filters - │ │ │ └── col1_0:25 ILIKE col1_0:25 [outer=(25), constraints=(/25: (/NULL - ])] - │ │ └── 84 - │ └── sort - │ ├── columns: col1_0:43!null col1_1:44 col1_3:45 - │ ├── cardinality: [0 - 84] - │ ├── key: (43) - │ ├── fd: (43)-->(44,45), (44,45)~~>(43) - │ ├── ordering: +45 - │ ├── limit hint: 84.00 - │ └── limit - │ ├── columns: col1_0:43!null col1_1:44 col1_3:45 - │ ├── cardinality: [0 - 84] - │ ├── key: (43) - │ ├── fd: (43)-->(44,45), (44,45)~~>(43) - │ ├── select - │ │ ├── columns: col1_0:43!null col1_1:44 col1_3:45 - │ │ ├── key: (43) - │ │ ├── fd: (43)-->(44,45), (44,45)~~>(43) - │ │ ├── limit hint: 84.00 - │ │ ├── scan table2@table2_col1_1_col1_3_key [as=tab_41831] - │ │ │ ├── columns: col1_0:43!null col1_1:44 col1_3:45 - │ │ │ ├── constraint: /44/45 - │ │ │ │ ├── [/NULL - /0] - │ │ │ │ ├── [/6 - /4999999] - │ │ │ │ └── [/5000001 - ] - │ │ │ ├── key: (43) - │ │ │ ├── fd: (43)-->(44,45), (44,45)~~>(43) - │ │ │ └── limit hint: 252.00 - │ │ └── filters - │ │ └── col1_0:43 ILIKE col1_0:43 [outer=(43), constraints=(/43: (/NULL - ])] - │ └── 84 - └── 84 + ├── fd: (7)-->(8-10), (8,9)~~>(7,10) + ├── scan t82730b [as=tab_41831] + │ ├── columns: col1_0:7!null col1_1:8 col1_3:9 col1_5:10 + │ ├── key: (7) + │ └── fd: (7)-->(8-10), (8,9)~~>(7,10) + └── filters + └── col1_0:7 ILIKE col1_0:7 [outer=(7), constraints=(/7: (/NULL - ])] + +# Regression test for #88993. Do not construct an unordered limit below the +# UnionAll. +exec-ddl +CREATE TABLE t88993 ( + a INT, + b INT, + c INT, + INDEX bca_idx (b, c, a) +); +---- + +opt disable=GenerateTopK +SELECT min(a) FROM t88993@bca_idx WHERE (b <= 11 AND c < 50) OR (b = 11 AND c = 50) OR (b >= 11 AND c > 50) +---- +scalar-group-by + ├── columns: min:7 + ├── cardinality: [1 - 1] + ├── key: () + ├── fd: ()-->(7) + ├── limit + │ ├── columns: a:1!null b:2!null c:3!null + │ ├── internal-ordering: +1 + │ ├── cardinality: [0 - 1] + │ ├── key: () + │ ├── fd: ()-->(1-3) + │ ├── union-all + │ │ ├── columns: a:1!null b:2!null c:3!null + │ │ ├── left columns: a:8 b:9 c:10 + │ │ ├── right columns: a:14 b:15 c:16 + │ │ ├── ordering: +1 + │ │ ├── limit hint: 1.00 + │ │ ├── limit + │ │ │ ├── columns: a:8!null b:9!null c:10!null + │ │ │ ├── internal-ordering: +8 opt(9,10) + │ │ │ ├── cardinality: [0 - 1] + │ │ │ ├── key: () + │ │ │ ├── fd: ()-->(8-10) + │ │ │ ├── limit hint: 1.00 + │ │ │ ├── select + │ │ │ │ ├── columns: a:8!null b:9!null c:10!null + │ │ │ │ ├── fd: ()-->(9,10) + │ │ │ │ ├── ordering: +8 opt(9,10) [actual: +8] + │ │ │ │ ├── limit hint: 1.00 + │ │ │ │ ├── scan t88993@bca_idx + │ │ │ │ │ ├── columns: a:8!null b:9!null c:10!null + │ │ │ │ │ ├── constraint: /9/10/8/11: (/11/50/NULL - /11/50] + │ │ │ │ │ ├── flags: force-index=bca_idx + │ │ │ │ │ ├── fd: ()-->(9,10) + │ │ │ │ │ └── ordering: +8 opt(9,10) [actual: +8] + │ │ │ │ └── filters + │ │ │ │ └── (((b:9 <= 11) AND (c:10 < 50)) OR ((b:9 = 11) AND (c:10 = 50))) OR ((b:9 >= 11) AND (c:10 > 50)) [outer=(9,10), constraints=(/9: (/NULL - ]; /10: (/NULL - /49] [/50 - /50] [/51 - ])] + │ │ │ └── 1 + │ │ └── sort + │ │ ├── columns: a:14!null b:15!null c:16!null + │ │ ├── ordering: +14 + │ │ ├── limit hint: 1.00 + │ │ └── select + │ │ ├── columns: a:14!null b:15!null c:16!null + │ │ ├── scan t88993@bca_idx + │ │ │ ├── columns: a:14 b:15!null c:16 + │ │ │ ├── constraint: /15/16/14/17 + │ │ │ │ ├── (/NULL - /11/49] + │ │ │ │ └── (/11/51/NULL - ] + │ │ │ └── flags: force-index=bca_idx + │ │ └── filters + │ │ ├── (((b:15 <= 11) AND (c:16 < 50)) OR ((b:15 = 11) AND (c:16 = 50))) OR ((b:15 >= 11) AND (c:16 > 50)) [outer=(15,16), constraints=(/15: (/NULL - ]; /16: (/NULL - /49] [/50 - /50] [/51 - ])] + │ │ └── a:14 IS NOT NULL [outer=(14), constraints=(/14: (/NULL - ]; tight)] + │ └── 1 + └── aggregations + └── const-agg [as=min:7, outer=(1)] + └── a:1 From 949bd9049869c8cc796ed9043e5f64c41beef319 Mon Sep 17 00:00:00 2001 From: Bilal Akhtar Date: Thu, 29 Sep 2022 17:20:49 -0400 Subject: [PATCH 6/6] storage: Try seeks using nexts, other fast-paths in CheckSSTConflicts Import cancellations and retries tend to set sstToReqTS and have incoming SSTs where we can expect all keys to have the same timestamp. This allows us to more efficiently skip keys in the engine beyond what range key masking does for us (eg. we don't need to check each individual SST key for conflict with an engine range key). This change takes advantage of that to speed up import retries. It also replaces engine seeks with Next()s as much as possible; the TrySeekUsingNext optimization inside the iterator isn't kicking in as some seeks/nexts are interspersed. Release note: None --- pkg/storage/sst.go | 85 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/pkg/storage/sst.go b/pkg/storage/sst.go index f49a26d2955f..b67209c2d79e 100644 --- a/pkg/storage/sst.go +++ b/pkg/storage/sst.go @@ -37,6 +37,10 @@ import ( // (key/value/timestamp), for backwards compatibility. If disallowShadowingBelow // is non-empty, disallowShadowing is ignored. // +// sstTimestamp, if non-zero, represents the timestamp that all keys in the SST +// are expected to be at. This method can make performance optimizations with +// the expectation that no SST keys will be at any other timestamp. +// // The given SST and reader cannot contain intents or inline values (i.e. zero // timestamps), but this is only checked for keys that exist in both sides, for // performance. @@ -52,7 +56,7 @@ func CheckSSTConflicts( leftPeekBound, rightPeekBound roachpb.Key, disallowShadowing bool, disallowShadowingBelow hlc.Timestamp, - sstToReqTimestamp hlc.Timestamp, + sstTimestamp hlc.Timestamp, maxIntents int64, usePrefixSeek bool, ) (enginepb.MVCCStats, error) { @@ -72,6 +76,11 @@ func CheckSSTConflicts( rightPeekBound = keys.MaxKey } + // In some iterations below, we try to call Next() instead of SeekGE() for a + // few iterations, as nexts are more performant. If `numNextsBeforeSeek` nexts + // are not sufficient to land at or after a desired SeekGE key, we fall back to + // a seek. + const numNextsBeforeSeek = 5 var statsDiff enginepb.MVCCStats var intents []roachpb.Intent if usePrefixSeek { @@ -132,11 +141,26 @@ func CheckSSTConflicts( } rkIter.Close() + if usePrefixSeek { + // Prefix iteration and range key masking don't work together. See the + // comment on the panic inside pebbleIterator.setOptions. + sstTimestamp = hlc.Timestamp{} + } extIter := reader.NewMVCCIterator(MVCCKeyAndIntentsIterKind, IterOptions{ - KeyTypes: IterKeyTypePointsAndRanges, - LowerBound: leftPeekBound, - UpperBound: rightPeekBound, - RangeKeyMaskingBelow: sstToReqTimestamp, + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: leftPeekBound, + UpperBound: rightPeekBound, + // NB: Range key masking is performant, but it skips instances where we need + // to adjust GCBytesAge in the returned stats diff. Consider an example + // where a point key is masked by a range tombstone, and we added a new + // revision of that key above the range tombstone in the SST. The GCBytesAge + // contribution of that range tombstone on the point key's key (as opposed + // to the version contribution) needs to be un-done as that key is now being + // used by the live key. + // + // TODO(bilal): Add tests that test for this case, and close this gap in + // GCBytesAge calculation. + RangeKeyMaskingBelow: sstTimestamp, Prefix: usePrefixSeek, useL6Filters: true, }) @@ -716,7 +740,20 @@ func CheckSSTConflicts( sstIter.SeekGE(MVCCKey{Key: extKey.Key}) sstOK, sstErr = sstIter.Valid() if sstOK { - extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key}) + // Seeks on the engine are expensive. Try Next()ing if we're very close + // to the sst key (which we might be). + nextsUntilSeek := numNextsBeforeSeek + for extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 { + extIter.NextKey() + extOK, _ = extIter.Valid() + nextsUntilSeek-- + if nextsUntilSeek <= 0 { + break + } + } + if extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 { + extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key}) + } } extOK, extErr = extIter.Valid() continue @@ -735,6 +772,42 @@ func CheckSSTConflicts( statsDiff.KeyBytes -= int64(len(extKey.Key) + 1) } + // Fast path with sstTimestamp set and a common case of import cancellation. + // Since we use range key masking, we can just Next() the ext iterator + // past its range key. + if sstTimestamp.IsSet() && extHasRange && !extHasPoint && !sstHasRange { + if vers, ok := extRangeKeys.FirstAtOrAbove(sstTimestamp); !ok || vers.Timestamp.Equal(sstTimestamp) { + // All range key versions are below the request timestamp. We can seek + // past the range key, as all SST points/ranges are going to be above + // this range key. + extIter.Next() + extOK, extErr = extIter.Valid() + if !extOK { + break + } + + sstIter.SeekGE(MVCCKey{Key: extIter.UnsafeKey().Key}) + sstOK, sstErr = sstIter.Valid() + if sstOK { + // Seeks on the engine are expensive. Try Next()ing if we're very close + // to the sst key (which we might be). + nextsUntilSeek := numNextsBeforeSeek + for extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 { + extIter.NextKey() + extOK, _ = extIter.Valid() + nextsUntilSeek-- + if nextsUntilSeek <= 0 { + break + } + } + if extOK && extIter.UnsafeKey().Key.Compare(sstIter.UnsafeKey().Key) < 0 { + extIter.SeekGE(MVCCKey{Key: sstIter.UnsafeKey().Key}) + } + } + extOK, extErr = extIter.Valid() + continue + } + } steppedExtIter := false // Before Next-ing the SST iter, if it contains any range keys, check if both: // 1) the next SST key takes us outside the current SST range key