diff --git a/pkg/internal/client/db.go b/pkg/internal/client/db.go
index e9d55123f034..b806bcd5109c 100644
--- a/pkg/internal/client/db.go
+++ b/pkg/internal/client/db.go
@@ -451,10 +451,11 @@ func (db *DB) DelRange(ctx context.Context, begin, end interface{}) error {
 	return getOneErr(db.Run(ctx, b), b)
 }

-// AdminMerge merges the range containing key and the subsequent
-// range. After the merge operation is complete, the range containing
-// key will contain all of the key/value pairs of the subsequent range
-// and the subsequent range will no longer exist.
+// AdminMerge merges the range containing key and the subsequent range. After
+// the merge operation is complete, the range containing key will contain all of
+// the key/value pairs of the subsequent range and the subsequent range will no
+// longer exist. Neither range may contain learner replicas; if one does, an
+// error is returned.
 //
 // key can be either a byte slice or a string.
 func (db *DB) AdminMerge(ctx context.Context, key interface{}) error {
diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
index bfa3df7761d7..634091c028f2 100644
--- a/pkg/sql/opt/memo/memo.go
+++ b/pkg/sql/opt/memo/memo.go
@@ -142,9 +142,8 @@ type Memo struct {
 	// curID is the highest currently in-use scalar expression ID.
 	curID opt.ScalarID

-	// withExprs is the set of With expressions that have been constructed thus
-	// far.
-	withExprs []RelExpr
+	// curWithID is the highest currently in-use WITH ID.
+	curWithID opt.WithID

 	// WARNING: if you add more members, add initialization code in Init.
 }
@@ -162,7 +161,6 @@ func (m *Memo) Init(evalCtx *tree.EvalContext) {
 	m.rootExpr = nil
 	m.rootProps = nil
 	m.memEstimate = 0
-	m.withExprs = nil

 	m.dataConversion = evalCtx.SessionData.DataConversion
 	m.reorderJoinsLimit = evalCtx.SessionData.ReorderJoinsLimit
@@ -172,6 +170,7 @@ func (m *Memo) Init(evalCtx *tree.EvalContext) {
 	m.saveTablesPrefix = evalCtx.SessionData.SaveTablesPrefix

 	m.curID = 0
+	m.curWithID = 0
 }

 // IsEmpty returns true if there are no expressions in the memo.
@@ -359,23 +358,8 @@ func (m *Memo) RequestColStat(
 	return nil, false
 }

-// AddWithBinding adds a new expression to the set of referenceable With
-// expressions, returning its associated ID that can be used to retrieve it.
-func (m *Memo) AddWithBinding(e RelExpr) opt.WithID {
-	m.withExprs = append(m.withExprs, e)
-	return opt.WithID(len(m.withExprs))
-}
-
-// WithExpr returns the expression associated with the given WithID.
-// This shouldn't be used for anything relating to physical props.
-func (m *Memo) WithExpr(id opt.WithID) RelExpr {
-	return m.withExprs[id-1]
-}
-
-// CopyWiths copies over the set of WITH expressions used by the other memo.
-// TODO(justin): These are just used to compute stats, we should have a way to
-// avoid hanging on to expressions like this.
-func (m *Memo) CopyWiths(o *Memo, replace func(opt.Expr) opt.Expr) {
-	m.withExprs = make([]RelExpr, len(o.withExprs))
-	copy(m.withExprs, o.withExprs)
+// NextWithID returns a not-yet-assigned identifier for a WITH expression.
+func (m *Memo) NextWithID() opt.WithID {
+	m.curWithID++
+	return m.curWithID
 }
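With this change, a WITH ID is no longer an index into a stored slice of expressions; it is just a monotonically increasing tag that a With expression shares with its WithScans, and the memo keeps no reference to the bound expression. Below is a distilled, standalone sketch of the new scheme; the type names are shortened stand-ins for the real opt/memo code, not the actual package API:

```go
package main

import "fmt"

// WithID stands in for opt.WithID: an opaque identifier for a WITH expression.
type WithID uint64

// Memo records only the highest ID handed out so far. Unlike the removed
// withExprs slice, it holds no references to the expressions being tagged.
type Memo struct {
	curWithID WithID
}

// NextWithID returns a not-yet-assigned identifier for a WITH expression.
func (m *Memo) NextWithID() WithID {
	m.curWithID++
	return m.curWithID
}

func main() {
	var m Memo
	a, b := m.NextWithID(), m.NextWithID()
	fmt.Println(a, b) // 1 2: each WITH gets a distinct tag
}
```

The optbuilder call sites further down (mutation_builder.go and select.go) switch from AddWithBinding(expr) to exactly this NextWithID() pattern, which is what lets CopyWiths and the memo's withExprs slice go away.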
diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
index 353f4a26ce10..2bbf4cf525fb 100644
--- a/pkg/sql/opt/memo/statistics_builder.go
+++ b/pkg/sql/opt/memo/statistics_builder.go
@@ -349,7 +349,11 @@ func (sb *statisticsBuilder) colStat(colSet opt.ColSet, e RelExpr) *props.Column
 		return sb.colStat(colSet, e.Child(1).(RelExpr))

 	case opt.WithScanOp:
-		return sb.colStatWithScan(colSet, e.(*WithScanExpr))
+		// This is tricky, since if we deferred to the expression being referenced,
+		// the computation of stats for a WithScan would depend on something
+		// outside of the expression itself. Just call it unknown for now.
+		// TODO(justin): find a real solution for this.
+		return sb.colStatUnknown(colSet, e.Relational())

 	case opt.FakeRelOp:
 		panic(errors.AssertionFailedf("FakeRelOp does not contain col stat for %v", colSet))
@@ -2061,37 +2065,6 @@ func (sb *statisticsBuilder) colStatSequenceSelect(
 	return colStat
 }

-// +-----------+
-// | With Scan |
-// +-----------+
-
-func (sb *statisticsBuilder) colStatWithScan(
-	colSet opt.ColSet, ws *WithScanExpr,
-) *props.ColumnStatistic {
-	relProps := ws.Relational()
-	s := &relProps.Stats
-
-	withExpr := ws.Memo().WithExpr(ws.ID)
-
-	// We need to pass on the colStat request to the referenced expression, but
-	// we need to translate the columns to the ones returned by the original
-	// expression, rather than the reference.
-	cols := translateColSet(colSet, ws.OutCols, ws.InCols)
-
-	// Note that if the WITH contained placeholders, we still hold onto the
-	// original, unreplaced expression, and so we won't be able to generate stats
-	// corresponding to the replaced expression.
-	// TODO(justin): find a way to lift this limitation. One way could be to
-	// rebuild WithScans when the WITH they reference has placeholders that get
-	// replaced.
-
-	colstat, _ := s.ColStats.Add(colSet)
-	*colstat = *sb.colStat(cols, withExpr)
-	colstat.Cols = colSet
-
-	return colstat
-}
-
 // +---------+
 // | Unknown |
 // +---------+
diff --git a/pkg/sql/opt/memo/testdata/stats/groupby b/pkg/sql/opt/memo/testdata/stats/groupby
index a89172d1e28d..8fff6350f061 100644
--- a/pkg/sql/opt/memo/testdata/stats/groupby
+++ b/pkg/sql/opt/memo/testdata/stats/groupby
@@ -468,37 +468,37 @@ GROUP BY q.b
 with &1 (q)
  ├── columns: "?column?":6(int!null)
  ├── cardinality: [0 - 3]
- ├── stats: [rows=0.22654092]
+ ├── stats: [rows=1]
  ├── fd: ()-->(6)
  ├── values
  │ ├── columns: column1:1(bool!null) column2:2(int)
  │ ├── cardinality: [3 - 3]
- │ ├── stats: [rows=3, distinct(1)=2, null(1)=0, distinct(2)=2, null(2)=2]
+ │ ├── stats: [rows=3]
  │ ├── (true, NULL) [type=tuple{bool, int}]
  │ ├── (false, NULL) [type=tuple{bool, int}]
  │ └── (true, 5) [type=tuple{bool, int}]
 └── project
  ├── columns: "?column?":6(int!null)
  ├── cardinality: [0 - 3]
- ├── stats: [rows=0.22654092]
+ ├── stats: [rows=1]
  ├── fd: ()-->(6)
  ├── select
  │ ├── columns: b:4(int) bool_or:5(bool!null)
  │ ├── cardinality: [0 - 3]
- │ ├── stats: [rows=0.22654092, distinct(5)=0.22654092, null(5)=0]
+ │ ├── stats: [rows=1, distinct(5)=1, null(5)=0]
  │ ├── key: (4)
  │ ├── fd: ()-->(5)
  │ ├── group-by
  │ │ ├── columns: b:4(int) bool_or:5(bool)
  │ │ ├── grouping columns: b:4(int)
  │ │ ├── cardinality: [0 - 3]
- │ │ ├── stats: [rows=1.29289322, distinct(4)=1.29289322, null(4)=1, distinct(5)=1.29289322, null(5)=1]
+ │ │ ├── stats: [rows=1, distinct(4)=1, null(4)=0, distinct(5)=1, null(5)=0]
  │ │ ├── key: (4)
  │ │ ├── fd: (4)-->(5)
  │ │ ├── select
  │ │ │ ├── columns: a:3(bool!null) b:4(int)
  │ │ │ ├── cardinality: [0 - 3]
- │ │ │ ├── stats: [rows=1.5, distinct(3)=1, null(3)=0, distinct(4)=1.29289322, null(4)=1]
+ │ │ │ ├── stats: [rows=1, distinct(3)=1, null(3)=0, distinct(4)=1, null(4)=0]
  │ │ │ ├── fd: ()-->(3)
  │ │ │ ├── with-scan &1 (q)
  │ │ │ │ ├── columns: a:3(bool!null) b:4(int)
@@ -506,7 +506,7 @@ with &1 (q)
  │ │ │ │ │ ├── column1:1(bool) => a:3(bool)
  │ │ │ │ │ └── column2:2(int) => b:4(int)
  │ │ │ │ ├── cardinality: [3 - 3]
- │ │ │ │ ├── stats: [rows=3, distinct(3)=2, null(3)=0, distinct(4)=2, null(4)=2]
+ │ │ │ │ ├── stats: [rows=3]
  │ │ │ │ └── fd: (1)-->(3), (2)-->(4)
  │ │ │ └── filters
  │ │ │ └── variable: a [type=bool, outer=(3), constraints=(/3: [/true - /true]; tight), fd=()-->(3)]
diff --git a/pkg/sql/opt/memo/testdata/stats/scan b/pkg/sql/opt/memo/testdata/stats/scan
index 23f9b10a7bb9..551a7d27ffee 100644
--- a/pkg/sql/opt/memo/testdata/stats/scan
+++ b/pkg/sql/opt/memo/testdata/stats/scan
@@ -539,38 +539,38 @@ WHERE
 ----
 with &1 (subq)
  ├── columns: "?column?":26(int!null)
- ├── stats: [rows=0.95099005]
+ ├── stats: [rows=1]
  ├── fd: ()-->(26)
  ├── project
  │ ├── columns: col1:23(bool) tab1.g:18(int4!null)
- │ ├── stats: [rows=333333.333, distinct(18)=100, null(18)=0, distinct(23)=333333.333, null(23)=16336.65]
+ │ ├── stats: [rows=333333.333]
  │ ├── inner-join (hash)
  │ │ ├── columns: tab0.e:5(varchar) tab0.f:6("char") tab0.h:8(varchar) tab0.j:10(float!null) tab1.e:16(varchar) tab1.f:17("char") tab1.g:18(int4!null) tab1.j:21(float!null)
- │ │ ├── stats: [rows=333333.333, distinct(10)=100, null(10)=0, distinct(18)=100, null(18)=0, distinct(21)=100, null(21)=0, distinct(5,6,8,16,17)=333333.333, null(5,6,8,16,17)=16336.65]
+ │ │ ├── stats: [rows=333333.333, distinct(10)=100, null(10)=0, distinct(18)=100, null(18)=0, distinct(21)=100, null(21)=0]
  │ │ ├── scan tab0
  │ │ │ ├── columns: tab0.e:5(varchar) tab0.f:6("char") tab0.h:8(varchar) tab0.j:10(float!null)
- │ │ │ └── stats: [rows=1000, distinct(10)=100, null(10)=0, distinct(5,6,8)=1000, null(5,6,8)=29.701]
+ │ │ │ └── stats: [rows=1000, distinct(10)=100, null(10)=0]
  │ │ ├── scan tab1
  │ │ │ ├── columns: tab1.e:16(varchar) tab1.f:17("char") tab1.g:18(int4!null) tab1.j:21(float!null)
- │ │ │ └── stats: [rows=1000, distinct(18)=100, null(18)=0, distinct(21)=100, null(21)=0, distinct(16,17)=1000, null(16,17)=19.9]
+ │ │ │ └── stats: [rows=1000, distinct(18)=100, null(18)=0, distinct(21)=100, null(21)=0]
  │ │ └── filters
  │ │ └── tab0.j IN (tab1.j,) [type=bool, outer=(10,21)]
  │ └── projections
  │ └── CASE WHEN ilike_escape(regexp_replace(tab0.h, tab1.e, tab0.f, tab0.e::STRING), tab1.f, '') THEN true ELSE false END [type=bool, outer=(5,6,8,16,17)]
 └── project
  ├── columns: "?column?":26(int!null)
- ├── stats: [rows=0.95099005]
+ ├── stats: [rows=1]
  ├── fd: ()-->(26)
  ├── select
  │ ├── columns: col0:24(int4!null) col1:25(bool!null)
- │ ├── stats: [rows=0.95099005, distinct(24)=0.95099005, null(24)=0, distinct(25)=0.95099005, null(25)=0]
+ │ ├── stats: [rows=1, distinct(24)=1, null(24)=0, distinct(25)=1, null(25)=0]
  │ ├── fd: ()-->(25)
  │ ├── with-scan &1 (subq)
  │ │ ├── columns: col0:24(int4!null) col1:25(bool)
  │ │ ├── mapping:
  │ │ │ ├── tab1.g:18(int4) => col0:24(int4)
  │ │ │ └── col1:23(bool) => col1:25(bool)
- │ │ ├── stats: [rows=333333.333, distinct(24)=100, null(24)=0, distinct(25)=333333.333, null(25)=16336.65]
+ │ │ ├── stats: [rows=333333.333]
  │ │ └── fd: (18)-->(24), (23)-->(25)
  │ └── filters
  │ └── variable: col1 [type=bool, outer=(25), constraints=(/25: [/true - /true]; tight), fd=()-->(25)]
diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select
index 4705e4e23fef..7f33c26c31eb 100644
--- a/pkg/sql/opt/memo/testdata/stats/select
+++ b/pkg/sql/opt/memo/testdata/stats/select
@@ -1369,32 +1369,32 @@ SELECT x FROM t WHERE x
 ----
 with &1 (t)
  ├── columns: x:6(bool!null)
- ├── stats: [rows=4e+10, distinct(6)=1, null(6)=0]
+ ├── stats: [rows=1, distinct(6)=1, null(6)=0]
  ├── fd: ()-->(6)
  ├── project
  │ ├── columns: x:5(bool)
- │ ├── stats: [rows=4e+20, distinct(5)=1, null(5)=4e+20]
+ │ ├── stats: [rows=4e+20]
  │ ├── left-join (hash)
  │ │ ├── columns: t1.x:1(bool) t2.x:3(bool)
- │ │ ├── stats: [rows=4e+20, distinct(1,3)=1, null(1,3)=4e+20]
+ │ │ ├── stats: [rows=4e+20]
  │ │ ├── scan t1
  │ │ │ ├── columns: t1.x:1(bool)
- │ │ │ └── stats: [rows=2e+10, distinct(1)=1, null(1)=2e+10]
+ │ │ │ └── stats: [rows=2e+10]
  │ │ ├── scan t2
  │ │ │ ├── columns: t2.x:3(bool)
- │ │ │ └── stats: [rows=2e+10, distinct(3)=1, null(3)=2e+10]
+ │ │ │ └── stats: [rows=2e+10]
  │ │ └── filters (true)
  │ └── projections
  │ └── (t1.x::INT8 << 5533)::BOOL OR t2.x [type=bool, outer=(1,3)]
 └── select
  ├── columns: x:6(bool!null)
- ├── stats: [rows=4e+10, distinct(6)=1, null(6)=0]
+ ├── stats: [rows=1, distinct(6)=1, null(6)=0]
  ├── fd: ()-->(6)
  ├── with-scan &1 (t)
  │ ├── columns: x:6(bool)
  │ ├── mapping:
  │ │ └── x:5(bool) => x:6(bool)
- │ ├── stats: [rows=4e+20, distinct(6)=1, null(6)=4e+20]
+ │ ├── stats: [rows=4e+20]
  │ └── fd: (5)-->(6)
 └── filters
  └── variable: x [type=bool, outer=(6), constraints=(/6: [/true - /true]; tight), fd=()-->(6)]
diff --git a/pkg/sql/opt/memo/testdata/stats/with b/pkg/sql/opt/memo/testdata/stats/with
index ec8f2257bf96..bf7a3ce130e6 100644
--- a/pkg/sql/opt/memo/testdata/stats/with
+++ b/pkg/sql/opt/memo/testdata/stats/with
@@ -41,7 +41,7 @@ with &1 (foo)
 ├── fd: (1)-->(2-4), (2)-->(5), (3)-->(6)
 ├── scan a
 │ ├── columns: a.x:1(int!null) a.y:2(int) a.s:3(string)
- │ ├── stats: [rows=5000, distinct(1)=5000, null(1)=0, distinct(2)=400, null(2)=0, distinct(3)=10, null(3)=0]
+ │ ├── stats: [rows=5000]
 │ ├── key: (1)
 │ └── fd: (1)-->(2,3)
 └── with-scan &1 (foo)
@@ -50,6 +50,6 @@ with &1 (foo)
 │ ├── a.x:1(int) => x:4(int)
 │ ├── a.y:2(int) => y:5(int)
 │ └── a.s:3(string) => s:6(string)
- ├── stats: [rows=5000, distinct(4)=5000, null(4)=0, distinct(5)=400, null(5)=0, distinct(6)=10, null(6)=0]
+ ├── stats: [rows=5000]
 ├── key: (1)
 └── fd: (1)-->(2-4), (2)-->(5), (3)-->(6)
diff --git a/pkg/sql/opt/norm/factory.go b/pkg/sql/opt/norm/factory.go
index 76bf410188cd..714bbbdcb85e 100644
--- a/pkg/sql/opt/norm/factory.go
+++ b/pkg/sql/opt/norm/factory.go
@@ -189,7 +189,6 @@ func (f *Factory) CopyAndReplace(
 	// Copy all metadata to the target memo so that referenced tables and columns
 	// can keep the same ids they had in the "from" memo.
 	f.mem.Metadata().CopyFrom(from.Memo().Metadata())
-	f.mem.CopyWiths(from.Memo(), replace)

 	// Perform copy and replacement, and store result as the root of this
 	// factory's memo.
diff --git a/pkg/sql/opt/optbuilder/mutation_builder.go b/pkg/sql/opt/optbuilder/mutation_builder.go
index dd12c2f02e01..3e9b856fc670 100644
--- a/pkg/sql/opt/optbuilder/mutation_builder.go
+++ b/pkg/sql/opt/optbuilder/mutation_builder.go
@@ -759,7 +759,7 @@ func (mb *mutationBuilder) buildFKChecks() {
 	// need to buffer it. This could be a normalization rule, but it's probably
 	// more efficient if we did it in here (or we'd end up building the entire FK
 	// subtrees twice).
-	mb.withID = mb.b.factory.Memo().AddWithBinding(mb.outScope.expr)
+	mb.withID = mb.b.factory.Memo().NextWithID()

 	for i, n := 0, mb.tab.OutboundForeignKeyCount(); i < n; i++ {
 		fk := mb.tab.OutboundForeignKey(i)
diff --git a/pkg/sql/opt/optbuilder/select.go b/pkg/sql/opt/optbuilder/select.go
index 4ad70465c659..b3a95fb454be 100644
--- a/pkg/sql/opt/optbuilder/select.go
+++ b/pkg/sql/opt/optbuilder/select.go
@@ -557,7 +557,7 @@ func (b *Builder) buildCTE(
 	cteScope = projectionsScope

-	id := b.factory.Memo().AddWithBinding(cteScope.expr)
+	id := b.factory.Memo().NextWithID()

 	// No good way to show non-select expressions, like INSERT, here.
 	var stmt tree.SelectStatement
diff --git a/pkg/storage/merge_queue.go b/pkg/storage/merge_queue.go
index 33ec326b5bbf..c9b0c5e7d3fd 100644
--- a/pkg/storage/merge_queue.go
+++ b/pkg/storage/merge_queue.go
@@ -27,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/pkg/errors"
 )

 const (
@@ -239,6 +240,15 @@ func (mq *mergeQueue) process(
 		return nil
 	}

+	// Range was manually split and not expired, so skip merging.
+	now := mq.store.Clock().Now()
+	if now.Less(rhsDesc.GetStickyBit()) {
+		log.VEventf(ctx, 2, "skipping merge: ranges were manually split and sticky bit was not expired")
+		// TODO(jeffreyxiao): Consider returning a purgatory error to avoid
+		// repeatedly processing ranges that cannot be merged.
+		return nil
+	}
+
 	mergedDesc := &roachpb.RangeDescriptor{
 		StartKey: lhsDesc.StartKey,
 		EndKey:   rhsDesc.EndKey,
@@ -264,13 +274,46 @@ func (mq *mergeQueue) process(
 		return nil
 	}

-	if !replicaSetsEqual(lhsDesc.Replicas().Unwrap(), rhsDesc.Replicas().Unwrap()) {
+	{
+		// AdminMerge errors if there are learners on either side and
+		// AdminRelocateRange removes any on the range it operates on. For the sake
+		// of obviousness, just remove them all upfront.
+		newLHSDesc, err := removeLearners(ctx, lhsRepl.store.DB(), lhsDesc)
+		if err != nil {
+			log.VEventf(ctx, 2, `%v`, err)
+			return err
+		}
+		lhsDesc = newLHSDesc
+		newRHSDesc, err := removeLearners(ctx, lhsRepl.store.DB(), &rhsDesc)
+		if err != nil {
+			log.VEventf(ctx, 2, `%v`, err)
+			return err
+		}
+		rhsDesc = *newRHSDesc
+	}
+	lhsReplicas, rhsReplicas := lhsDesc.Replicas().All(), rhsDesc.Replicas().All()
+
+	// Defensive sanity check that everything is now a voter.
+	for i := range lhsReplicas {
+		if lhsReplicas[i].GetType() != roachpb.ReplicaType_VOTER {
+			return errors.Errorf(`cannot merge non-voter replicas on lhs: %v`, lhsReplicas)
+		}
+	}
+	for i := range rhsReplicas {
+		if rhsReplicas[i].GetType() != roachpb.ReplicaType_VOTER {
+			return errors.Errorf(`cannot merge non-voter replicas on rhs: %v`, rhsReplicas)
+		}
+	}
+
+	if !replicaSetsEqual(lhsReplicas, rhsReplicas) {
 		var targets []roachpb.ReplicationTarget
-		for _, lhsReplDesc := range lhsDesc.Replicas().Unwrap() {
+		for _, lhsReplDesc := range lhsReplicas {
 			targets = append(targets, roachpb.ReplicationTarget{
 				NodeID: lhsReplDesc.NodeID, StoreID: lhsReplDesc.StoreID,
 			})
 		}
+		// AdminRelocateRange moves the lease to the first target in the list, so
+		// sort the existing leaseholder there to leave it unchanged.
 		lease, _ := lhsRepl.GetLease()
 		for i := range targets {
 			if targets[i].NodeID == lease.Replica.NodeID && targets[i].StoreID == lease.Replica.StoreID {
@@ -287,15 +330,6 @@ func (mq *mergeQueue) process(
 		}
 	}

-	// Range was manually split and not expired, so skip merging.
-	now := mq.store.Clock().Now()
-	if now.Less(rhsDesc.GetStickyBit()) {
-		log.VEventf(ctx, 2, "skipping merge: ranges were manually split and sticky bit was not expired")
-		// TODO(jeffreyxiao): Consider returning a purgatory error to avoid
-		// repeatedly processing ranges that cannot be merged.
-		return nil
-	}
-
 	log.VEventf(ctx, 2, "merging to produce range: %s-%s", mergedDesc.StartKey, mergedDesc.EndKey)
 	reason := fmt.Sprintf("lhs+rhs has (size=%s+%s qps=%.2f+%.2f --> %.2fqps) below threshold (size=%s, qps=%.2f)",
 		humanizeutil.IBytes(lhsStats.Total()),
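The queue logic above and the AdminMerge change below add up to a new caller contract: strip learners first, then merge. A hedged sketch of that calling pattern follows; it assumes ctx, a *client.DB named db, and a *roachpb.RangeDescriptor named desc are in scope, and uses the removeLearners helper added in replica_command.go below:

```go
// Sketch only: mirrors what mergeQueue.process now does before merging.
// removeLearners issues AdminChangeReplicas(REMOVE_REPLICA) for each learner
// and returns the updated descriptor.
newDesc, err := removeLearners(ctx, db, desc)
if err != nil {
	return err
}
desc = newDesc
// With only voters left, AdminMerge no longer fails with
// "cannot merge range with non-voter replicas on lhs/rhs".
if err := db.AdminMerge(ctx, desc.StartKey.AsRawKey()); err != nil {
	return err
}
```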
diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go
index 9ae8be77637f..a21e199c0fcd 100644
--- a/pkg/storage/replica_command.go
+++ b/pkg/storage/replica_command.go
@@ -566,8 +566,19 @@ func (r *Replica) AdminMerge(
 		// Should never happen, but just in case.
 		return errors.Errorf("ranges are not adjacent; %s != %s", origLeftDesc.EndKey, rightDesc.StartKey)
 	}
-	if l, r := origLeftDesc.Replicas(), rightDesc.Replicas(); !replicaSetsEqual(l.Unwrap(), r.Unwrap()) {
-		return errors.Errorf("ranges not collocated; %s != %s", l, r)
+	// For simplicity, don't handle learner replicas; expect the caller to
+	// resolve them first. (Defensively, we check that there are no non-voter
+	// replicas, in case some third type is later added.) This behavior can be
+	// changed later if the complexity becomes worth it, but it's not right now.
+	lReplicas, rReplicas := origLeftDesc.Replicas(), rightDesc.Replicas()
+	if len(lReplicas.Voters()) != len(lReplicas.All()) {
+		return errors.Errorf("cannot merge range with non-voter replicas on lhs: %s", lReplicas)
+	}
+	if len(rReplicas.Voters()) != len(rReplicas.All()) {
+		return errors.Errorf("cannot merge range with non-voter replicas on rhs: %s", rReplicas)
+	}
+	if !replicaSetsEqual(lReplicas.All(), rReplicas.All()) {
+		return errors.Errorf("ranges not collocated; %s != %s", lReplicas, rReplicas)
 	}

 	updatedLeftDesc := *origLeftDesc
@@ -1549,6 +1560,20 @@ func (s *Store) AdminRelocateRange(
 	rangeDesc.SetReplicas(rangeDesc.Replicas().DeepCopy())
 	startKey := rangeDesc.StartKey.AsRawKey()

+	// Step 0: Remove all learners so we don't have to think about them. We could
+	// do something smarter here and try to promote them, but it doesn't seem
+	// worth the complexity right now. Revisit if this is an issue in practice.
+	//
+	// Note that we can't just add the learners to removeTargets. The below logic
+	// always does add then remove and if the learner was in the requested
+	// targets, we might try to add it before removing it.
+	newDesc, err := removeLearners(ctx, s.DB(), &rangeDesc)
+	if err != nil {
+		log.Warning(ctx, err)
+		return err
+	}
+	rangeDesc = *newDesc
+
 	// Step 1: Compute which replicas are to be added and which are to be removed.
 	//
 	// TODO(radu): we can't have multiple replicas on different stores on the
@@ -1611,32 +1636,6 @@ func (s *Store) AdminRelocateRange(
 		}
 	}

-	// updateRangeDesc updates the passed RangeDescriptor following the successful
-	// completion of an AdminChangeReplicasRequest with the single provided target
-	// and changeType.
-	// TODO(ajwerner): Remove this for 19.2 after AdminChangeReplicas always
-	// returns a non-nil Desc.
-	updateRangeDesc := func(
-		desc *roachpb.RangeDescriptor,
-		changeType roachpb.ReplicaChangeType,
-		target roachpb.ReplicationTarget,
-	) {
-		switch changeType {
-		case roachpb.ADD_REPLICA:
-			desc.AddReplica(roachpb.ReplicaDescriptor{
-				NodeID:    target.NodeID,
-				StoreID:   target.StoreID,
-				ReplicaID: desc.NextReplicaID,
-			})
-			desc.NextReplicaID++
-		case roachpb.REMOVE_REPLICA:
-			newReplicas := removeTargetFromSlice(desc.Replicas().All(), target)
-			desc.SetReplicas(roachpb.MakeReplicaDescriptors(&newReplicas))
-		default:
-			panic(errors.Errorf("unknown ReplicaChangeType %v", changeType))
-		}
-	}
-
 	sysCfg := s.cfg.Gossip.GetSystemConfig()
 	if sysCfg == nil {
 		return fmt.Errorf("no system config available, unable to perform RelocateRange")
@@ -1720,12 +1719,7 @@ func (s *Store) AdminRelocateRange(
 			// local copy of the range descriptor such that future allocator
 			// decisions take it into account.
 			addTargets = removeTargetFromSlice(addTargets, target)
-			if newDesc != nil {
-				rangeInfo.Desc = newDesc
-			} else {
-				// TODO(ajwerner): Remove this case for 19.2.
-				updateRangeDesc(rangeInfo.Desc, roachpb.ADD_REPLICA, target)
-			}
+			rangeInfo.Desc = newDesc
 		}

 		if len(removeTargets) > 0 && len(removeTargets) > len(addTargets) {
@@ -1757,12 +1751,7 @@ func (s *Store) AdminRelocateRange(
 			// copy of the range descriptor such that future allocator decisions take
 			// its absence into account.
 			removeTargets = removeTargetFromSlice(removeTargets, target)
-			if newDesc != nil {
-				rangeInfo.Desc = newDesc
-			} else {
-				// TODO(ajwerner): Remove this case for 19.2.
-				updateRangeDesc(rangeInfo.Desc, roachpb.REMOVE_REPLICA, target)
-			}
+			rangeInfo.Desc = newDesc
 		}
 	}

@@ -1788,6 +1777,26 @@ func removeTargetFromSlice(
 	return targets
 }

+func removeLearners(
+	ctx context.Context, db *client.DB, desc *roachpb.RangeDescriptor,
+) (*roachpb.RangeDescriptor, error) {
+	learners := desc.Replicas().Learners()
+	if len(learners) == 0 {
+		return desc, nil
+	}
+	targets := make([]roachpb.ReplicationTarget, len(learners))
+	for i := range learners {
+		targets[i].NodeID = learners[i].NodeID
+		targets[i].StoreID = learners[i].StoreID
+	}
+	log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc)
+	newDesc, err := db.AdminChangeReplicas(ctx, desc.StartKey, roachpb.REMOVE_REPLICA, targets, *desc)
+	if err != nil {
+		return nil, errors.Wrapf(err, `removing learners from %s`, desc)
+	}
+	return newDesc, nil
+}
+
 // adminScatter moves replicas and leaseholders for a selection of ranges.
 func (r *Replica) adminScatter(
 	ctx context.Context, args roachpb.AdminScatterRequest,
diff --git a/pkg/storage/replica_learner_test.go b/pkg/storage/replica_learner_test.go
index 2e96e36fd324..04c7ec5ab830 100644
--- a/pkg/storage/replica_learner_test.go
+++ b/pkg/storage/replica_learner_test.go
@@ -14,6 +14,7 @@ import (
 	"context"
 	"fmt"
 	"path/filepath"
+	"sort"
 	"strconv"
 	"strings"
 	"sync/atomic"
@@ -574,3 +575,126 @@ func TestLearnerFollowerRead(t *testing.T) {
 		return nil
 	})
 }
+
+func TestLearnerAdminRelocateRange(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+	knobs, ltk := makeLearnerTestKnobs()
+	tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
+		ServerArgs:      base.TestServerArgs{Knobs: knobs},
+		ReplicationMode: base.ReplicationManual,
+	})
+	defer tc.Stopper().Stop(ctx)
+	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
+	db.Exec(t, `SET CLUSTER SETTING kv.learner_replicas.enabled = true`)
+
+	scratchStartKey := tc.ScratchRange(t)
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 1)
+	_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
+	_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(2))
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 0)
+
+	// Test AdminRelocateRange's treatment of learners by having one that it has
+	// to remove and one that should stay and become a voter.
+	//
+	// Before: 1 (voter), 2 (learner), 3 (learner)
+	// After:  1 (voter), 2 (voter), 4 (voter)
+	targets := []roachpb.ReplicationTarget{tc.Target(0), tc.Target(1), tc.Target(3)}
+	require.NoError(t, tc.Server(0).DB().AdminRelocateRange(ctx, scratchStartKey, targets))
+	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
+	voters := desc.Replicas().Voters()
+	require.Len(t, voters, len(targets))
+	sort.Slice(voters, func(i, j int) bool { return voters[i].NodeID < voters[j].NodeID })
+	for i := range voters {
+		require.Equal(t, targets[i].NodeID, voters[i].NodeID, `%v`, voters)
+		require.Equal(t, targets[i].StoreID, voters[i].StoreID, `%v`, voters)
+	}
+	require.Empty(t, desc.Replicas().Learners())
+}
+
+func TestLearnerAdminMerge(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	ctx := context.Background()
+	knobs, ltk := makeLearnerTestKnobs()
+	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
+		ServerArgs:      base.TestServerArgs{Knobs: knobs},
+		ReplicationMode: base.ReplicationManual,
+	})
+	defer tc.Stopper().Stop(ctx)
+	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
+	db.Exec(t, `SET CLUSTER SETTING kv.learner_replicas.enabled = true`)
+
+	scratchStartKey := tc.ScratchRange(t)
+	splitKey1 := scratchStartKey.Next()
+	splitKey2 := splitKey1.Next()
+	_, _ = tc.SplitRangeOrFatal(t, splitKey1)
+	_, _ = tc.SplitRangeOrFatal(t, splitKey2)
+
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 1)
+	_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
+	_ = tc.AddReplicasOrFatal(t, splitKey2, tc.Target(1))
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 0)
+
+	// Learner on the lhs should fail.
+	err := tc.Server(0).DB().AdminMerge(ctx, scratchStartKey)
+	if !testutils.IsError(err, `cannot merge range with non-voter replicas on lhs`) {
+		t.Fatalf(`expected "cannot merge range with non-voter replicas on lhs" error, got: %+v`, err)
+	}
+	// Learner on the rhs should fail.
+	err = tc.Server(0).DB().AdminMerge(ctx, splitKey1)
+	if !testutils.IsError(err, `cannot merge range with non-voter replicas on rhs`) {
+		t.Fatalf(`expected "cannot merge range with non-voter replicas on rhs" error, got: %+v`, err)
+	}
+}
+
+func TestMergeQueueSeesLearner(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	knobs, ltk := makeLearnerTestKnobs()
+	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
+		ServerArgs:      base.TestServerArgs{Knobs: knobs},
+		ReplicationMode: base.ReplicationManual,
+	})
+	defer tc.Stopper().Stop(ctx)
+	db := sqlutils.MakeSQLRunner(tc.ServerConn(0))
+	db.Exec(t, `SET CLUSTER SETTING kv.learner_replicas.enabled = true`)
+	// TestCluster currently overrides this when used with ReplicationManual.
+	db.Exec(t, `SET CLUSTER SETTING kv.range_merge.queue_enabled = true`)
+
+	scratchStartKey := tc.ScratchRange(t)
+	origDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
+
+	splitKey := scratchStartKey.Next()
+	_, _ = tc.SplitRangeOrFatal(t, splitKey)
+
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 1)
+	_ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))
+	atomic.StoreInt64(&ltk.replicaAddStopAfterLearnerAtomic, 0)
+
+	// Unsplit the range to clear the sticky bit.
+	require.NoError(t, tc.Server(0).DB().AdminUnsplit(ctx, splitKey))
+
+	// Run the merge queue.
+	store, repl := getFirstStoreReplica(t, tc.Server(0), scratchStartKey)
+	trace, errMsg, err := store.ManuallyEnqueue(ctx, "merge", repl, true /* skipShouldQueue */)
+	require.NoError(t, err)
+	require.Equal(t, ``, errMsg)
+	formattedTrace := tracing.FormatRecordedSpans(trace)
+	expectedMessages := []string{
+		`removing learner replicas \[n2,s2\]`,
+		`merging to produce range: /Table/Max-/Max`,
+	}
+	if err := testutils.MatchInOrder(formattedTrace, expectedMessages...); err != nil {
+		t.Fatal(err)
+	}
+
+	// Sanity check that the desc has the same bounds it did originally.
+	desc := tc.LookupRangeOrFatal(t, scratchStartKey)
+	require.Equal(t, origDesc.StartKey, desc.StartKey)
+	require.Equal(t, origDesc.EndKey, desc.EndKey)
+	// The merge removed the learner.
+	require.Len(t, desc.Replicas().Voters(), 1)
+	require.Empty(t, desc.Replicas().Learners())
+}
diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go
index 326afbdbcc65..f5a5ba779de9 100644
--- a/pkg/testutils/testcluster/testcluster.go
+++ b/pkg/testutils/testcluster/testcluster.go
@@ -358,6 +358,17 @@ func (tc *TestCluster) SplitRange(
 	return tc.Servers[0].SplitRange(splitKey)
 }

+// SplitRangeOrFatal is the same as SplitRange but will Fatal the test on error.
+func (tc *TestCluster) SplitRangeOrFatal(
+	t testing.TB, splitKey roachpb.Key,
+) (roachpb.RangeDescriptor, roachpb.RangeDescriptor) {
+	lhsDesc, rhsDesc, err := tc.Servers[0].SplitRange(splitKey)
+	if err != nil {
+		t.Fatalf(`splitting at %s: %+v`, splitKey, err)
+	}
+	return lhsDesc, rhsDesc
+}
+
 // Target returns a ReplicationTarget for the specified server.
 func (tc *TestCluster) Target(serverIdx int) roachpb.ReplicationTarget {
 	s := tc.Servers[serverIdx]