replica_command.go
// Copyright 2014 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"bytes"
"context"
"fmt"
"math/rand"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
"github.com/cockroachdb/cockroach/pkg/util/causer"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/pkg/errors"
"go.etcd.io/etcd/raft"
"go.etcd.io/etcd/raft/raftpb"
)
// AdminSplit divides the range into two ranges using args.SplitKey.
func (r *Replica) AdminSplit(
ctx context.Context, args roachpb.AdminSplitRequest, reason string,
) (reply roachpb.AdminSplitResponse, _ *roachpb.Error) {
if len(args.SplitKey) == 0 {
return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided")
}
var lastErr error
retryOpts := base.DefaultRetryOptions()
retryOpts.MaxRetries = 10
for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
// The replica may have been destroyed since the start of the retry loop. We
// need to explicitly check this condition. Having a valid lease, as we
// verify below, does not imply that the range still exists: even after a
// range has been merged into its left-hand neighbor, its final lease (i.e.,
// the lease we have in r.mu.state.Lease) can remain valid indefinitely.
if _, err := r.IsDestroyed(); err != nil {
return reply, roachpb.NewError(err)
}
// Admin commands always require the range lease to begin (see
// executeAdminBatch), but we may have lost it while in this retry loop.
// Without the lease, a replica's local descriptor can be arbitrarily
// stale, which will result in a ConditionFailedError. To avoid this,
// we make sure that we still have the lease before each attempt.
if _, pErr := r.redirectOnOrAcquireLease(ctx); pErr != nil {
return roachpb.AdminSplitResponse{}, pErr
}
reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */, reason)
// On seeing a ConditionFailedError or an AmbiguousResultError, retry
// the command with the updated descriptor.
if retry := causer.Visit(lastErr, func(err error) bool {
switch err.(type) {
case *roachpb.ConditionFailedError:
return true
case *roachpb.AmbiguousResultError:
return true
default:
return false
}
}); !retry {
return reply, roachpb.NewError(lastErr)
}
}
// If we broke out of the loop after MaxRetries, return the last error.
return roachpb.AdminSplitResponse{}, roachpb.NewError(lastErr)
}
func maybeDescriptorChangedError(desc *roachpb.RangeDescriptor, err error) (string, bool) {
if detail, ok := err.(*roachpb.ConditionFailedError); ok {
// Provide a better message in the common case that the range being changed
// was already changed by a concurrent transaction.
var actualDesc roachpb.RangeDescriptor
if !detail.ActualValue.IsPresent() {
return fmt.Sprintf("descriptor changed: expected %s != [actual] nil (range subsumed)", desc), true
} else if err := detail.ActualValue.GetProto(&actualDesc); err == nil &&
desc.RangeID == actualDesc.RangeID && !desc.Equal(actualDesc) {
return fmt.Sprintf("descriptor changed: [expected] %s != [actual] %s", desc, actualDesc), true
}
}
return "", false
}
func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string {
var s string
if status != nil && status.RaftState == raft.StateLeader {
for replicaID, pr := range status.Progress {
if replicaID == status.Lead {
// TODO(tschottdorf): remove this line once we have picked up
// https://github.com/etcd-io/etcd/pull/10279
continue
}
if pr.State == raft.ProgressStateReplicate {
// This follower is in good working order.
continue
}
s += fmt.Sprintf("; r%d/%d is ", rangeID, replicaID)
switch pr.State {
case raft.ProgressStateSnapshot:
// If the Raft snapshot queue is backed up, replicas can spend
// minutes or worse until they are caught up.
s += "waiting for a Raft snapshot"
case raft.ProgressStateProbe:
// Assuming the split has already been delayed for a little bit,
// seeing a follower that is probing hints at some problem with
// Raft or Raft message delivery. (Of course it's possible that
// the follower *just* entered probing state).
s += "being probed (may or may not need a Raft snapshot)"
default:
// Future proofing.
s += "in unknown state " + pr.State.String()
}
}
}
return s
}
// adminSplitWithDescriptor divides the range into two ranges, using
// either args.SplitKey (if provided) or an internally computed key that aims
// to roughly equipartition the range by size. The split is done inside of a
// distributed txn which writes updated left and new right hand side range
// descriptors, and updates the range addressing metadata. The handover of
// responsibility for the reassigned key range is carried out seamlessly
// through a split trigger carried out as part of the commit of that
// transaction.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. An
// operation which might split a range should obtain a copy of the range's
// current descriptor before making the decision to split. If the decision is
// affirmative the descriptor is passed to AdminSplit, which performs a
// Conditional Put on the RangeDescriptor to ensure that no other operation has
// modified the range while the decision was being made.
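//
// A rough sketch of that pattern (illustrative only; shouldSplit is a
// hypothetical decision helper, not something defined in this package):
//
//   desc := r.Desc()                  // capture the current descriptor
//   if shouldSplit(desc) {
//       // desc becomes the expected value of the CPut inside the split txn.
//       _, err := r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
//   }
//
// If a concurrent operation modified the descriptor in the meantime, that
// CPut fails with a ConditionFailedError and AdminSplit's retry loop re-reads
// the descriptor and tries again.
//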
// TODO(tschottdorf): should assert that split key is not a local key.
//
// See the comment on splitTrigger for details on the complexities.
func (r *Replica) adminSplitWithDescriptor(
ctx context.Context,
args roachpb.AdminSplitRequest,
desc *roachpb.RangeDescriptor,
delayable bool,
reason string,
) (roachpb.AdminSplitResponse, error) {
var reply roachpb.AdminSplitResponse
// Determine split key if not provided with args. This scan is
// allowed to be relatively slow because admin commands don't block
// other commands.
log.Event(ctx, "split begins")
var splitKey roachpb.RKey
{
var foundSplitKey roachpb.Key
if len(args.SplitKey) == 0 {
// Find a key to split by size.
var err error
targetSize := r.GetMaxBytes() / 2
foundSplitKey, err = engine.MVCCFindSplitKey(
ctx, r.store.engine, desc.StartKey, desc.EndKey, targetSize)
if err != nil {
return reply, errors.Errorf("unable to determine split key: %s", err)
}
if foundSplitKey == nil {
// No suitable split key could be found.
return reply, unsplittableRangeError{}
}
} else {
// If the key that routed this request to this range is now out of this
// range's bounds, return an error for the client to try again on the
// correct range.
if !storagebase.ContainsKey(*desc, args.Key) {
return reply, roachpb.NewRangeKeyMismatchError(args.Key, args.Key, desc)
}
foundSplitKey = args.SplitKey
}
if !storagebase.ContainsKey(*desc, foundSplitKey) {
return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
}
var err error
splitKey, err = keys.Addr(foundSplitKey)
if err != nil {
return reply, err
}
if !splitKey.Equal(foundSplitKey) {
return reply, errors.Errorf("cannot split range at range-local key %s", splitKey)
}
if !engine.IsValidSplitKey(foundSplitKey) {
return reply, errors.Errorf("cannot split range at key %s", splitKey)
}
}
// If the range starts at the splitKey, we treat the AdminSplit
// as a no-op and return success instead of throwing an error.
if desc.StartKey.Equal(splitKey) {
if len(args.SplitKey) == 0 {
log.Fatal(ctx, "MVCCFindSplitKey returned start key of range")
}
log.Event(ctx, "range already split")
return reply, nil
}
log.Event(ctx, "found split key")
// Create right hand side range descriptor.
rightDesc, err := r.store.NewRangeDescriptor(ctx, splitKey, desc.EndKey, desc.Replicas)
if err != nil {
return reply, errors.Errorf("unable to allocate right hand side range descriptor: %s", err)
}
// Init updated version of existing range descriptor.
leftDesc := *desc
leftDesc.IncrementGeneration()
leftDesc.EndKey = splitKey
var extra string
if delayable {
extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
}
extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
log.Infof(ctx, "initiating a split of this range at key %s [r%d] (%s)%s",
splitKey.StringWithDirs(nil /* valDirs */, 50 /* maxLen */), rightDesc.RangeID, reason, extra)
if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
log.Event(ctx, "split closure begins")
defer log.Event(ctx, "split closure ends")
txn.SetDebugName(splitTxnName)
// Update existing range descriptor for left hand side of
// split. Note that we mutate the descriptor for the left hand
// side of the split first to locate the txn record there.
{
b := txn.NewBatch()
leftDescKey := keys.RangeDescriptorKey(leftDesc.StartKey)
if err := updateRangeDescriptor(b, leftDescKey, desc, &leftDesc); err != nil {
return err
}
// Commit this batch first to ensure that the transaction record
// is created in the right place (split trigger relies on this),
// but also to ensure the transaction record is created _before_
// intents for the RHS range descriptor or addressing records.
// Keep in mind that the BeginTransaction request is injected
// to accompany the first write request, but if part of a batch
// which spans ranges, the dist sender does not guarantee the
// order which parts of the split batch arrive.
//
// Sending the batch containing only the first write guarantees
// the transaction record is written first, preventing cases
// where splits are aborted early due to conflicts with meta
// intents (see #9265).
log.Event(ctx, "updating LHS descriptor")
if err := txn.Run(ctx, b); err != nil {
return err
}
}
// Log the split into the range event log.
// TODO(spencer): event logging API should accept a batch
// instead of a transaction; there's no reason this logging
// shouldn't be done in parallel via the batch with the updated
// range addressing.
if err := r.store.logSplit(ctx, txn, leftDesc, *rightDesc); err != nil {
return err
}
b := txn.NewBatch()
// Create range descriptor for right hand side of the split.
rightDescKey := keys.RangeDescriptorKey(rightDesc.StartKey)
if err := updateRangeDescriptor(b, rightDescKey, nil, rightDesc); err != nil {
return err
}
// Update range descriptor addressing record(s).
if err := splitRangeAddressing(b, rightDesc, &leftDesc); err != nil {
return err
}
// End the transaction manually, instead of letting RunTransaction
// loop do it, in order to provide a split trigger.
b.AddRawRequest(&roachpb.EndTransactionRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
SplitTrigger: &roachpb.SplitTrigger{
LeftDesc: leftDesc,
RightDesc: *rightDesc,
},
},
})
// Commit txn with final batch (RHS descriptor and meta).
log.Event(ctx, "commit txn with batch containing RHS descriptor and meta records")
return txn.Run(ctx, b)
}); err != nil {
// The ConditionFailedError can occur because the descriptors acting
// as expected values in the CPuts used to update the left or right
// range descriptors are picked outside the transaction. Return
// ConditionFailedError in the error detail so that the command can be
// retried.
if msg, ok := maybeDescriptorChangedError(desc, err); ok {
// NB: we have to wrap the existing error here as consumers of this
// code look at the root cause to sniff out the changed descriptor.
err = &benignError{errors.Wrap(err, msg)}
}
return reply, errors.Wrapf(err, "split at key %s failed", splitKey)
}
return reply, nil
}
// AdminMerge extends this range to subsume the range that comes next
// in the key space. The merge is performed inside of a distributed
// transaction which writes the left hand side range descriptor (the
// subsuming range) and deletes the range descriptor for the right
// hand side range (the subsumed range). It also updates the range
// addressing metadata. The handover of responsibility for the
// reassigned key range is carried out seamlessly through a merge
// trigger carried out as part of the commit of that transaction. A
// merge requires that the two ranges are collocated on the same set
// of replicas.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. See the
// comment of "AdminSplit" for more information on this pattern.
func (r *Replica) AdminMerge(
ctx context.Context, args roachpb.AdminMergeRequest, reason string,
) (roachpb.AdminMergeResponse, *roachpb.Error) {
var reply roachpb.AdminMergeResponse
origLeftDesc := r.Desc()
if origLeftDesc.EndKey.Equal(roachpb.RKeyMax) {
// Merging the final range doesn't make sense.
return reply, roachpb.NewErrorf("cannot merge final range")
}
// Ensure that every current replica of the LHS has been initialized.
// Otherwise there is a rare race where the replica GC queue can GC a
// replica of the RHS too early. The comment on
// TestStoreRangeMergeUninitializedLHSFollower explains the situation in full.
if err := waitForReplicasInit(ctx, r.store.cfg.NodeDialer, *origLeftDesc); err != nil {
return reply, roachpb.NewError(errors.Wrap(
err, "waiting for all left-hand replicas to initialize"))
}
updatedLeftDesc := *origLeftDesc
rightDescKey := keys.RangeDescriptorKey(origLeftDesc.EndKey)
// Lookup right hand side range (subsumed). This really belongs
// inside the transaction for consistency, but it is important (for
// transaction record placement) that the first action inside the
// transaction is the conditional put to change the left hand side's
// descriptor end key. We look up the descriptor here only to get
// the new end key and then repeat the lookup inside the
// transaction.
{
var rightDesc roachpb.RangeDescriptor
if err := r.store.DB().GetProto(ctx, rightDescKey, &rightDesc); err != nil {
return reply, roachpb.NewError(err)
}
updatedLeftDesc.IncrementGeneration()
updatedLeftDesc.EndKey = rightDesc.EndKey
log.Infof(ctx, "initiating a merge of %s into this range (%s)", rightDesc, reason)
}
runMergeTxn := func(txn *client.Txn) error {
log.Event(ctx, "merge txn begins")
txn.SetDebugName(mergeTxnName)
// Observe the commit timestamp to force a client-side retry. See the
// comment on the retry loop after this closure for details.
//
// TODO(benesch): expose a proper API for preventing the fast path.
_ = txn.CommitTimestamp()
// Pipelining might send QueryIntent requests to the RHS after the RHS has
// noticed the merge and started blocking all traffic. This causes the merge
// transaction to deadlock. Just turn pipelining off; the structure of the
// merge transaction means pipelining provides no performance benefit
// anyway.
if err := txn.DisablePipelining(); err != nil {
return err
}
// Update the range descriptor for the receiving range.
{
b := txn.NewBatch()
leftDescKey := keys.RangeDescriptorKey(updatedLeftDesc.StartKey)
if err := updateRangeDescriptor(b, leftDescKey, origLeftDesc, &updatedLeftDesc); err != nil {
return err
}
// Commit this batch on its own to ensure that the transaction record
// is created in the right place (our triggers rely on this).
log.Event(ctx, "updating LHS descriptor")
if err := txn.Run(ctx, b); err != nil {
return err
}
}
// Do a consistent read of the right hand side's range descriptor.
var rightDesc roachpb.RangeDescriptor
if err := txn.GetProto(ctx, rightDescKey, &rightDesc); err != nil {
return err
}
// Verify that the two ranges are mergeable.
if !bytes.Equal(origLeftDesc.EndKey, rightDesc.StartKey) {
// Should never happen, but just in case.
return errors.Errorf("ranges are not adjacent; %s != %s", origLeftDesc.EndKey, rightDesc.StartKey)
}
if !bytes.Equal(rightDesc.EndKey, updatedLeftDesc.EndKey) {
// This merge raced with a split of the right-hand range.
// TODO(bdarnell): needs a test.
return errors.Errorf("range changed during merge; %s != %s", rightDesc.EndKey, updatedLeftDesc.EndKey)
}
if !replicaSetsEqual(origLeftDesc.Replicas, rightDesc.Replicas) {
return errors.Errorf("ranges not collocated; %s != %s", origLeftDesc.Replicas, rightDesc.Replicas)
}
// Log the merge into the range event log.
// TODO(spencer): event logging API should accept a batch
// instead of a transaction; there's no reason this logging
// shouldn't be done in parallel via the batch with the updated
// range addressing.
if err := r.store.logMerge(ctx, txn, updatedLeftDesc, rightDesc); err != nil {
return err
}
b := txn.NewBatch()
// Update the meta addressing records.
if err := mergeRangeAddressing(b, origLeftDesc, &updatedLeftDesc); err != nil {
return err
}
// Remove the range descriptor for the deleted range.
if err := updateRangeDescriptor(b, rightDescKey, &rightDesc, nil); err != nil {
return err
}
// Send off this batch, ensuring that intents are placed on both the local
// copy and meta2's copy of the right-hand side range descriptor before we
// send the Subsume request below. This is the precondition for sending a
// Subsume request; see the godoc on the Subsume request for details.
if err := txn.Run(ctx, b); err != nil {
return err
}
// Intents have been placed, so the merge is now in its critical phase. Get
// a consistent view of the data from the right-hand range. If the merge
// commits, we'll write this data to the left-hand range in the merge
// trigger.
br, pErr := client.SendWrapped(ctx, r.store.DB().NonTransactionalSender(),
&roachpb.SubsumeRequest{
RequestHeader: roachpb.RequestHeader{Key: rightDesc.StartKey.AsRawKey()},
LeftRange: *origLeftDesc,
})
if pErr != nil {
return pErr.GoError()
}
rhsSnapshotRes := br.(*roachpb.SubsumeResponse)
err := waitForApplication(ctx, r.store.cfg.NodeDialer, rightDesc, rhsSnapshotRes.LeaseAppliedIndex)
if err != nil {
return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
}
// Successful subsume, so we're guaranteed that the right-hand range will
// not serve another request unless this transaction aborts. End the
// transaction manually in order to provide a merge trigger.
b = txn.NewBatch()
b.AddRawRequest(&roachpb.EndTransactionRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
MergeTrigger: &roachpb.MergeTrigger{
LeftDesc: updatedLeftDesc,
RightDesc: rightDesc,
RightMVCCStats: rhsSnapshotRes.MVCCStats,
FreezeStart: rhsSnapshotRes.FreezeStart,
},
},
})
log.Event(ctx, "attempting commit")
return txn.Run(ctx, b)
}
// If the merge transaction encounters an error, we need to trigger a full
// abort and try again with a new transaction. Why? runMergeTxn has the side
// effect of sending a Subsume request to the right-hand range, which blocks
// the right-hand range from serving any traffic until the transaction commits
// or aborts. If we retry using the same transaction (i.e., a "transaction
// restart"), we'll send requests to the blocked right-hand range and
// deadlock. The right-hand range will see that the transaction is still
// pending and refuse to respond, but the transaction cannot commit until the
// right-hand range responds. By instead marking the transaction as aborted,
// we'll unlock the right-hand range, giving the next, fresh transaction a
// chance to succeed.
//
// Note that client.DB.Txn performs retries using the same transaction, so we
// have to use our own retry loop.
for {
txn := client.NewTxn(ctx, r.store.DB(), r.NodeID(), client.RootTxn)
err := runMergeTxn(txn)
if err != nil {
txn.CleanupOnError(ctx, err)
}
if _, canRetry := errors.Cause(err).(*roachpb.TransactionRetryWithProtoRefreshError); !canRetry {
if err != nil {
return reply, roachpb.NewErrorf("merge of range into %d failed: %s",
origLeftDesc.RangeID, err)
}
return reply, nil
}
}
}
func waitForApplication(
ctx context.Context, dialer *nodedialer.Dialer, desc roachpb.RangeDescriptor, leaseIndex uint64,
) error {
return contextutil.RunWithTimeout(ctx, "wait for application", 5*time.Second, func(ctx context.Context) error {
g := ctxgroup.WithContext(ctx)
for _, repl := range desc.Replicas {
repl := repl // copy for goroutine
g.GoCtx(func(ctx context.Context) error {
conn, err := dialer.Dial(ctx, repl.NodeID)
if err != nil {
return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
}
_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
RangeID: desc.RangeID,
LeaseIndex: leaseIndex,
})
return err
})
}
return g.Wait()
})
}
// waitForReplicasInit blocks until it has proof that the replicas listed in
// desc are initialized on their respective stores. It may return a false
// negative, i.e., claim that a replica is uninitialized when it is, in fact,
// initialized, but it will never return a false positive.
func waitForReplicasInit(
ctx context.Context, dialer *nodedialer.Dialer, desc roachpb.RangeDescriptor,
) error {
return contextutil.RunWithTimeout(ctx, "wait for replicas init", 5*time.Second, func(ctx context.Context) error {
g := ctxgroup.WithContext(ctx)
for _, repl := range desc.Replicas {
repl := repl // copy for goroutine
g.GoCtx(func(ctx context.Context) error {
conn, err := dialer.Dial(ctx, repl.NodeID)
if err != nil {
return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
}
_, err = NewPerReplicaClient(conn).WaitForReplicaInit(ctx, &WaitForReplicaInitRequest{
StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
RangeID: desc.RangeID,
})
return err
})
}
return g.Wait()
})
}
type snapshotError struct {
// NB: don't implement Cause() on this type without also updating IsSnapshotError.
cause error
}
func (s *snapshotError) Error() string {
return fmt.Sprintf("snapshot failed: %s", s.cause.Error())
}
// IsSnapshotError returns true iff the error indicates a preemptive
// snapshot failed.
func IsSnapshotError(err error) bool {
return causer.Visit(err, func(err error) bool {
_, ok := errors.Cause(err).(*snapshotError)
return ok
})
}
// ChangeReplicas adds or removes a replica of a range. The change is performed
// in a distributed transaction and takes effect when that transaction is committed.
// When removing a replica, only the NodeID and StoreID fields of the Replica are used.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. See the
// comment of "adminSplitWithDescriptor" for more information on this pattern.
//
// Changing the replicas for a range is complicated. A change is initiated by
// the "replicate" queue when it encounters a range which has too many
// replicas, too few replicas or requires rebalancing. Addition and removal of
// a replica is divided into four phases. The first phase, which occurs in
// Replica.ChangeReplicas, is performed via a distributed transaction which
// updates the range descriptor and the meta range addressing information. This
// transaction includes a special ChangeReplicasTrigger on the EndTransaction
// request. A ConditionalPut of the RangeDescriptor implements the optimistic
// lock on the RangeDescriptor mentioned previously. Like all transactions, the
// requests within the transaction are replicated via Raft, including the
// EndTransaction request.
//
// The second phase of processing occurs when the batch containing the
// EndTransaction is proposed to raft. This proposing occurs on whatever
// replica received the batch, usually, but not always, the range lease
// holder. defaultProposeRaftCommandLocked notices that the EndTransaction
// contains a ChangeReplicasTrigger and proposes a ConfChange to Raft (via
// raft.RawNode.ProposeConfChange).
//
// The ConfChange is propagated to all of the replicas similar to a normal Raft
// command, though additional processing is done inside of Raft. A Replica
// encounters the ConfChange in Replica.handleRaftReady and executes it using
// raft.RawNode.ApplyConfChange. If a new replica was added the Raft leader
// will start sending it heartbeat messages and attempting to bring it up to
// date. If a replica was removed, it is at this point that the Raft leader
// will stop communicating with it.
//
// The fourth phase of change replicas occurs when each replica for the range
// encounters the ChangeReplicasTrigger when applying the EndTransaction
// request. The replica will update its local range descriptor so as to contain
// the new set of replicas. If the replica is the one that is being removed, it
// will queue itself for removal with replicaGCQueue.
//
// Note that a removed replica may not see the EndTransaction containing the
// ChangeReplicasTrigger. The ConfChange operation will be applied as soon as a
// quorum of nodes have committed it. If the removed replica is down or the
// message is dropped for some reason the removed replica will not be
// notified. The replica GC queue will eventually discover and cleanup this
// state.
//
// When a new replica is added, it will have to catch up to the state of the
// other replicas. The Raft leader automatically handles this by either sending
// the new replica Raft log entries to apply, or by generating and sending a
// snapshot. See Replica.Snapshot and Replica.Entries.
//
// Note that Replica.ChangeReplicas returns when the distributed transaction
// has been committed to a quorum of replicas in the range. The actual
// replication of data occurs asynchronously via a snapshot or application of
// Raft log entries. This is important for the replicate queue to be aware
// of. A node can process hundreds or thousands of ChangeReplicas operations
// per second even though the actual replication of data proceeds at a much
// slower rate. In order to avoid having this background replication overwhelm
// the system, replication is throttled via a reservation system. When
// allocating a new replica for a range, the replicate queue reserves space for
// that replica on the target store via a ReservationRequest. (See
// StorePool.reserve). The reservation is fulfilled when the snapshot is
// applied.
//
// TODO(peter): There is a rare scenario in which a replica can be brought up
// to date via Raft log replay. In this scenario, the reservation will be left
// dangling until it expires. See #7849.
//
// TODO(peter): Describe preemptive snapshots. Preemptive snapshots are needed
// for the replicate queue to function properly. Currently the replicate queue
// will fire off as many replica additions as possible until it starts getting
// reservations denied at which point it will ignore the replica until the next
// scanner cycle.
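//
// For orientation, a hypothetical invocation adding a replica (the target,
// reason, and details values below are illustrative, not taken from real
// callers):
//
//   err := repl.ChangeReplicas(
//       ctx,
//       roachpb.ADD_REPLICA,
//       roachpb.ReplicationTarget{NodeID: 2, StoreID: 2},
//       repl.Desc(),
//       storagepb.ReasonRangeUnderReplicated,
//       "" /* details */,
//   )
//
// The call returns once the descriptor change has committed; any remaining
// catch-up of the new replica happens asynchronously, as described above.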
func (r *Replica) ChangeReplicas(
ctx context.Context,
changeType roachpb.ReplicaChangeType,
target roachpb.ReplicationTarget,
desc *roachpb.RangeDescriptor,
reason storagepb.RangeLogEventReason,
details string,
) error {
return r.changeReplicas(ctx, changeType, target, desc, SnapshotRequest_REBALANCE, reason, details)
}
func (r *Replica) changeReplicas(
ctx context.Context,
changeType roachpb.ReplicaChangeType,
target roachpb.ReplicationTarget,
desc *roachpb.RangeDescriptor,
priority SnapshotRequest_Priority,
reason storagepb.RangeLogEventReason,
details string,
) error {
repDesc := roachpb.ReplicaDescriptor{
NodeID: target.NodeID,
StoreID: target.StoreID,
}
repDescIdx := -1 // tracks NodeID && StoreID
nodeUsed := false // tracks NodeID only
for i, existingRep := range desc.Replicas {
nodeUsedByExistingRep := existingRep.NodeID == repDesc.NodeID
nodeUsed = nodeUsed || nodeUsedByExistingRep
if nodeUsedByExistingRep && existingRep.StoreID == repDesc.StoreID {
repDescIdx = i
repDesc.ReplicaID = existingRep.ReplicaID
break
}
}
rangeID := desc.RangeID
updatedDesc := *desc
updatedDesc.Replicas = append([]roachpb.ReplicaDescriptor(nil), desc.Replicas...)
switch changeType {
case roachpb.ADD_REPLICA:
// If the replica exists on the remote node, no matter in which store,
// abort the replica add.
if nodeUsed {
if repDescIdx != -1 {
return errors.Errorf("%s: unable to add replica %v which is already present", r, repDesc)
}
return errors.Errorf("%s: unable to add replica %v; node already has a replica", r, repDesc)
}
// Send a pre-emptive snapshot. Note that the replica to which this
// snapshot is addressed has not yet had its replica ID initialized; this
// is intentional, and serves to avoid the following race with the replica
// GC queue:
//
// - snapshot received, a replica is lazily created with the "real" replica ID
// - the replica is eligible for GC because it is not yet a member of the range
// - GC queue runs, creating a raft tombstone with the replica's ID
// - the replica is added to the range
// - lazy creation of the replica fails due to the raft tombstone
//
// Instead, the replica GC queue will create a tombstone with replica ID
// zero, which is never legitimately used, and thus never interferes with
// raft operations. Racing with the replica GC queue can still partially
// negate the benefits of pre-emptive snapshots, but that is a recoverable
// degradation, not a catastrophic failure.
//
// NB: A closure is used here so that we can release the snapshot as soon
// as it has been applied on the remote and before the ChangeReplicas
// operation is processed. This is important to allow other ranges to make
// progress which might be required for this ChangeReplicas operation to
// complete. See #10409.
if err := r.sendSnapshot(ctx, repDesc, snapTypePreemptive, priority); err != nil {
return err
}
repDesc.ReplicaID = updatedDesc.NextReplicaID
updatedDesc.NextReplicaID++
updatedDesc.Replicas = append(updatedDesc.Replicas, repDesc)
case roachpb.REMOVE_REPLICA:
// If that exact node-store combination does not have the replica,
// abort the removal.
if repDescIdx == -1 {
return errors.Errorf("%s: unable to remove replica %v which is not present", r, repDesc)
}
updatedDesc.Replicas[repDescIdx] = updatedDesc.Replicas[len(updatedDesc.Replicas)-1]
updatedDesc.Replicas = updatedDesc.Replicas[:len(updatedDesc.Replicas)-1]
}
descKey := keys.RangeDescriptorKey(desc.StartKey)
if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
log.Event(ctx, "attempting txn")
txn.SetDebugName(replicaChangeTxnName)
// TODO(tschottdorf): oldDesc is used for sanity checks related to #7224.
// Remove when that has been solved. The failure mode is likely based on
// prior divergence of the Replica (in which case the check below does not
// fire because everything reads from the local, diverged, set of data),
// so we don't expect to see this fail in practice ever.
oldDesc := new(roachpb.RangeDescriptor)
if err := txn.GetProto(ctx, descKey, oldDesc); err != nil {
return err
}
log.Infof(ctx, "change replicas (%v %s): read existing descriptor %s",
changeType, repDesc, oldDesc)
{
b := txn.NewBatch()
// Important: the range descriptor must be the first thing touched in the transaction
// so the transaction record is co-located with the range being modified.
if err := updateRangeDescriptor(b, descKey, desc, &updatedDesc); err != nil {
return err
}
// Run transaction up to this point to create txn record early (see #9265).
if err := txn.Run(ctx, b); err != nil {
return err
}
}
// Log replica change into range event log.
if err := r.store.logChange(
ctx, txn, changeType, repDesc, updatedDesc, reason, details,
); err != nil {
return err
}
// End the transaction manually instead of letting RunTransaction
// loop do it, in order to provide a commit trigger.
b := txn.NewBatch()
// Update range descriptor addressing record(s).
if err := updateRangeAddressing(b, &updatedDesc); err != nil {
return err
}
b.AddRawRequest(&roachpb.EndTransactionRequest{
Commit: true,
InternalCommitTrigger: &roachpb.InternalCommitTrigger{
// TODO(benesch): this trigger should just specify the updated
// descriptor, like the split and merge triggers, so that the receiver
// doesn't need to reconstruct the range descriptor update.
ChangeReplicasTrigger: &roachpb.ChangeReplicasTrigger{
ChangeType: changeType,
Replica: repDesc,
UpdatedReplicas: updatedDesc.Replicas,
NextReplicaID: updatedDesc.NextReplicaID,
},
},
})
if err := txn.Run(ctx, b); err != nil {
log.Event(ctx, err.Error())
return err
}
if oldDesc.RangeID != 0 && !oldDesc.Equal(desc) {
// We read the previous value, it wasn't what we supposedly used in
// the CPut, but we still overwrote in the CPut above.
panic(fmt.Sprintf("committed replica change, but oldDesc != assumedOldDesc:\n%+v\n%+v\nnew desc:\n%+v",
oldDesc, desc, updatedDesc))
}
return nil
}); err != nil {
log.Event(ctx, err.Error())
if msg, ok := maybeDescriptorChangedError(desc, err); ok {
err = &benignError{errors.New(msg)}
}
return errors.Wrapf(err, "change replicas of r%d failed", rangeID)
}
log.Event(ctx, "txn complete")
return nil
}
// sendSnapshot sends a snapshot of the replica state to the specified
// replica. This is used for both preemptive snapshots that are performed
// before adding a replica to a range, and for Raft-initiated snapshots that
// are used to bring a replica up to date that has fallen too far
// behind. Currently only invoked from replicateQueue and raftSnapshotQueue. Be
// careful about adding additional calls as generating a snapshot is moderately
// expensive.
func (r *Replica) sendSnapshot(
ctx context.Context,
repDesc roachpb.ReplicaDescriptor,
snapType string,
priority SnapshotRequest_Priority,
) error {
snap, err := r.GetSnapshot(ctx, snapType)
if err != nil {
return errors.Wrapf(err, "%s: failed to generate %s snapshot", r, snapType)
}
defer snap.Close()
log.Event(ctx, "generated snapshot")
fromRepDesc, err := r.GetReplicaDescriptor()
if err != nil {
return errors.Wrapf(err, "%s: change replicas failed", r)
}
status := r.RaftStatus()
if status == nil {
// This code path is sometimes hit during scatter for replicas that
// haven't woken up yet.
return &benignError{errors.New("raft status not initialized")}
}
usesReplicatedTruncatedState, err := engine.MVCCGetProto(
ctx, snap.EngineSnap, keys.RaftTruncatedStateLegacyKey(r.RangeID), hlc.Timestamp{}, nil, engine.MVCCGetOptions{},
)
if err != nil {
return errors.Wrap(err, "loading legacy truncated state")
}
if !usesReplicatedTruncatedState && snap.State.TruncatedState.Index < snap.State.RaftAppliedIndex {
// If we're not using a legacy (replicated) truncated state, we can
// avoid sending the (past) Raft log in the snapshot in the first place
// and send only those entries that are actually useful to the follower.
//
// TODO(tbg): sending any entries at all doesn't seem worth it as now
// again we're relying on some mechanisms (quota pool, Raft uncommitted
// size limit) to make sure this doesn't grow without bounds.
snap.State.TruncatedState = &roachpb.RaftTruncatedState{
Index: snap.RaftSnap.Metadata.Index,
Term: snap.RaftSnap.Metadata.Term,
}
}
req := SnapshotRequest_Header{
State: snap.State,
// Tell the recipient whether it needs to synthesize the new
// unreplicated TruncatedState. It could tell by itself by peeking into
// the data, but it uses a write only batch for performance which
// doesn't support that; this is easier. Notably, this is true if the
// snap index itself is the one at which the migration happens.
//
// See VersionUnreplicatedRaftTruncatedState.
UnreplicatedTruncatedState: !usesReplicatedTruncatedState,
RaftMessageRequest: RaftMessageRequest{
RangeID: r.RangeID,
FromReplica: fromRepDesc,
ToReplica: repDesc,
Message: raftpb.Message{
Type: raftpb.MsgSnap,
To: uint64(repDesc.ReplicaID),
From: uint64(fromRepDesc.ReplicaID),
Term: status.Term,
Snapshot: snap.RaftSnap,
},
},
RangeSize: r.GetMVCCStats().Total(),
// Recipients can choose to decline preemptive snapshots.
CanDecline: snapType == snapTypePreemptive,
Priority: priority,
Strategy: SnapshotRequest_KV_BATCH,
}
sent := func() {
r.store.metrics.RangeSnapshotsGenerated.Inc(1)
}
if err := r.store.cfg.Transport.SendSnapshot(
ctx,
&r.store.cfg.RaftConfig,
r.store.allocator.storePool,
req,
snap,
r.store.Engine().NewBatch,
sent,
); err != nil {
return &snapshotError{err}
}
return nil
}
// replicaSetsEqual is used in AdminMerge to ensure that the ranges are
// all collocated on the same set of replicas.
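//
// For illustration (a hedged sketch, not exercised in this file): ordering and
// NodeIDs are irrelevant, only the multiset of StoreIDs matters:
//
//   a := []roachpb.ReplicaDescriptor{{NodeID: 1, StoreID: 1}, {NodeID: 2, StoreID: 2}}
//   b := []roachpb.ReplicaDescriptor{{NodeID: 2, StoreID: 2}, {NodeID: 1, StoreID: 1}}
//   replicaSetsEqual(a, b)     // true: same StoreIDs, different order
//   replicaSetsEqual(a, a[:1]) // false: lengths differ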
func replicaSetsEqual(a, b []roachpb.ReplicaDescriptor) bool {
if len(a) != len(b) {
return false
}
set := make(map[roachpb.StoreID]int)
for _, replica := range a {
set[replica.StoreID]++
}
for _, replica := range b {
set[replica.StoreID]--
}
for _, value := range set {
if value != 0 {
return false
}
}
return true
}
// updateRangeDescriptor adds a ConditionalPut on the range descriptor. The
// conditional put verifies that changes to the range descriptor are made in a
// well-defined order, preventing a scenario where a wayward replica which is
// no longer part of the original Raft group comes back online to form a
// splinter group with a node which was also a former replica, and hijacks the
// range descriptor. This is a last line of defense; other mechanisms should
// prevent rogue replicas from getting this far (see #768).
//
// oldDesc can be nil, meaning that the key is expected to not exist.
//
// Note that in addition to using this method to update the on-disk range
// descriptor, a CommitTrigger must be used to update the in-memory
// descriptor; it will not automatically be copied from newDesc.
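//
// A minimal sketch of the conditional put this function issues (illustrative
// only; the nil-descriptor cases need extra care that is not shown here):
//
//   newBytes, _ := protoutil.Marshal(newDesc)
//   oldBytes, _ := protoutil.Marshal(oldDesc)
//   b.CPut(descKey, newBytes, oldBytes)
//
// If the stored descriptor does not match oldDesc, the CPut fails with a
// ConditionFailedError, which callers translate via maybeDescriptorChangedError.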
func updateRangeDescriptor(
b *client.Batch,
descKey roachpb.Key,
oldDesc *roachpb.RangeDescriptor,
newDesc *roachpb.RangeDescriptor,
) error {