-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
txn_coord_sender.go
1470 lines (1305 loc) · 53.6 KB
/
txn_coord_sender.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvcoord
import (
"context"
"runtime/debug"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/cockroachdb/redact"
"go.opentelemetry.io/otel/attribute"
)
// OpTxnCoordSender represents a txn coordinator send operation. It is the
// operation name handed to AnnotateCtxWithSpan when a batch is sent through
// a TxnCoordSender.
const OpTxnCoordSender = "txn coordinator send"
var (
	// DisableCommitSanityCheck lets operators opt out of a fatal assertion
	// error that has been observed in the wild and whose root cause is not yet
	// known. It is populated from the COCKROACH_DISABLE_COMMIT_SANITY_CHECK
	// environment variable, with false as the fallback.
	//
	// See: https://github.com/cockroachdb/cockroach/pull/73512.
	DisableCommitSanityCheck = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_COMMIT_SANITY_CHECK", false)

	// forceTxnRetries turns on random transaction retries for test builds even
	// when they have not been enabled through testing knobs. It is populated
	// from the COCKROACH_FORCE_RANDOM_TXN_RETRIES environment variable, with
	// false as the fallback.
	forceTxnRetries = envutil.EnvOrDefaultBool("COCKROACH_FORCE_RANDOM_TXN_RETRIES", false)
)
// txnState enumerates the coordinator-side states of a transaction that
// determine whether an EndTxn request still needs to be sent.
//
//go:generate stringer -type=txnState
type txnState int

const (
	// txnPending is the ordinary state of a transaction that is still in
	// progress.
	txnPending txnState = iota

	// txnRetryableError is entered once the transaction has run into a
	// TransactionRetryWithProtoRefreshError. While in this state, calls to
	// Send() fail. Calling ClearTxnRetryableErr() moves the transaction back
	// to txnPending.
	txnRetryableError

	// txnError is entered once a batch has hit a non-retriable error. From
	// then on every batch other than EndTxn(commit=false) is rejected.
	txnError

	// txnFinalized is entered once an EndTxn(commit=true) has executed
	// successfully, or once an EndTxn(commit=false) has been sent - whether or
	// not that rollback itself succeeded. Every batch other than
	// EndTxn(commit=false) is rejected afterwards; a further rollback remains
	// permitted in case the first one failed.
	// TODO(andrei): we'd probably benefit from splitting this state into at least
	// two - transaction definitely cleaned up, and transaction potentially
	// cleaned up.
	txnFinalized
)
// A TxnCoordSender is the production implementation of kv.TxnSender. It is
// a Sender which wraps a lower-level Sender (a DistSender) to which it sends
// commands. It works on behalf of the client to keep a transaction's state
// (e.g. intents) and to perform periodic heartbeating of the transaction
// record when necessary. Unlike other senders, TxnCoordSender is not a
// singleton - an instance is created for every transaction by the
// TxnCoordSenderFactory.
//
// Among the functions it performs are:
//   - Heartbeating of the transaction record. Note that heartbeating is done only
//     from the root transaction coordinator, in the event that multiple
//     coordinators are active (i.e. in a distributed SQL flow).
//   - Accumulating lock spans.
//   - Attaching lock spans to EndTxn requests, for cleanup.
//   - Handling retriable errors by either bumping the transaction's epoch or, in
//     case of TransactionAbortedErrors, cleaning up the transaction (in this case,
//     the client.Txn is expected to create a new TxnCoordSender instance
//     transparently for the higher-level client).
//
// Since it is stateful, the TxnCoordSender needs to understand when a
// transaction is "finished" and the state can be destroyed. As such there's a
// contract that the client.Txn needs to obey. Read-only transactions don't
// matter - they're stateless. For the others, once an intent write is sent by
// the client, the TxnCoordSender considers the transaction completed in the
// following situations:
//   - A batch containing an EndTxn (commit or rollback) succeeds.
//   - A batch containing an EndTxn(commit=false) succeeds or fails. Only
//     more rollback attempts can follow a rollback attempt.
//   - A batch returns a TransactionAbortedError. As mentioned above, the client
//     is expected to create a new TxnCoordSender for the next transaction attempt.
//
// Note that "1PC" batches (i.e. batches containing both a Begin and an
// EndTxn) are no exception from the contract - if the batch fails, the
// client is expected to send a rollback (or perform another transaction attempt
// in case of retriable errors).
type TxnCoordSender struct {
	// mu groups the mutex together with the mutable state it guards.
	mu struct {
		syncutil.Mutex

		// txnState is the coordinator-side state of the transaction; see the
		// txnState constants.
		txnState txnState

		// storedRetryableErr is set when txnState == txnRetryableError. This
		// storedRetryableErr is returned to clients on Send().
		storedRetryableErr *kvpb.TransactionRetryWithProtoRefreshError

		// storedErr is set when txnState == txnError. This storedErr is returned to
		// clients on Send().
		storedErr *kvpb.Error

		// active is set whenever the transaction has sent any requests. Rolling
		// back to a savepoint taken before the TxnCoordSender became active resets
		// the field to false.
		active bool

		// closed is set once this transaction has either committed or rolled back
		// (including when the heartbeat loop cleans it up asynchronously). If the
		// client sends anything other than a rollback, it will get an error
		// (a retryable TransactionAbortedError in case of the async abort).
		closed bool

		// txn is the Transaction proto attached to all the requests and updated on
		// all the responses.
		txn roachpb.Transaction

		// userPriority is the txn's priority. Used when restarting the transaction.
		// This field is only populated on rootTxns.
		userPriority roachpb.UserPriority

		// commitWaitDeferred is set to true when the transaction commit-wait
		// state is deferred and should not be run automatically. Instead, the
		// caller of DeferCommitWait has assumed responsibility for performing
		// the commit-wait.
		commitWaitDeferred bool
	}

	// A pointer member to the creating factory provides access to
	// immutable factory settings.
	*TxnCoordSenderFactory

	// An ordered stack of pluggable request interceptors that can transform
	// batch requests and responses while each maintaining targeted state.
	// The stack is stored in a slice backed by the interceptorAlloc.arr and each
	// txnInterceptor implementation is embedded in the interceptorAlloc struct,
	// so the entire stack is allocated together with TxnCoordSender without any
	// additional heap allocations necessary.
	interceptorStack []txnInterceptor
	interceptorAlloc struct {
		// arr is the backing array for interceptorStack. Its length matches the
		// number of interceptors a root transaction places in its stack.
		arr [6]txnInterceptor
		txnHeartbeater
		txnSeqNumAllocator
		txnPipeliner
		txnSpanRefresher
		txnCommitter
		txnMetricRecorder
		txnLockGatekeeper // not in interceptorStack array.
	}

	// typ specifies whether this transaction is the top level,
	// or one of potentially many distributed transactions.
	typ kv.TxnType
}
// Compile-time assertion that *TxnCoordSender satisfies kv.TxnSender.
var _ kv.TxnSender = (*TxnCoordSender)(nil)
// txnInterceptor is a pluggable request interceptor. Interceptors transform
// requests and responses and can perform operations in the context of a
// transaction. Each TxnCoordSender keeps a stack of txnInterceptors and calls
// into that stack, under lock, whenever it sends a request.
type txnInterceptor interface {
	lockedSender

	// setWrapped installs the lockedSender that this interceptor wraps, i.e.
	// the next sender down the stack.
	setWrapped(wrapped lockedSender)

	// populateLeafInputState fills in the interceptor's portion of the input
	// payload for a LeafTxn.
	populateLeafInputState(tis *roachpb.LeafTxnInputState)

	// populateLeafFinalState fills in the interceptor's portion of the final
	// payload that a LeafTxn brings back into a RootTxn.
	populateLeafFinalState(tfs *roachpb.LeafTxnFinalState)

	// importLeafFinalState updates whatever internal state the interceptor
	// holds using the given LeafTxn final state.
	importLeafFinalState(ctx context.Context, tfs *roachpb.LeafTxnFinalState) error

	// epochBumpedLocked resets the interceptor after the txn's epoch has been
	// incremented.
	epochBumpedLocked()

	// createSavepointLocked records in the savepoint all of the state that
	// must be restored on a rollback.
	createSavepointLocked(ctx context.Context, s *savepoint)

	// rollbackToSavepointLocked restores the state that was previously
	// captured by createSavepointLocked().
	rollbackToSavepointLocked(ctx context.Context, s savepoint)

	// closeLocked closes the interceptor. It is invoked when the
	// TxnCoordSender shuts down because the txn either committed or aborted.
	// The method will be called exactly once from cleanupTxnLocked.
	closeLocked()
}
// newRootTxnCoordSender builds the TxnCoordSender for a root transaction. The
// supplied txn must be initialized, PENDING and have a zero sequence number;
// anything else is reported through log.Fatalf. pri is remembered as the
// transaction's user priority.
func newRootTxnCoordSender(
	tcf *TxnCoordSenderFactory, txn *roachpb.Transaction, pri roachpb.UserPriority,
) kv.TxnSender {
	ctx := context.TODO()
	txn.AssertInitialized(ctx)
	if txn.Status != roachpb.PENDING {
		log.Fatalf(ctx, "unexpected non-pending txn in RootTransactionalSender: %s", txn)
	}
	if txn.Sequence != 0 {
		log.Fatalf(ctx, "cannot initialize root txn with seq != 0: %s", txn)
	}

	tcs := &TxnCoordSender{
		TxnCoordSenderFactory: tcf,
		typ:                   kv.RootTxn,
	}
	tcs.mu.txnState = txnPending
	tcs.mu.userPriority = pri

	// Build the request/response interceptor stack. Every interceptor lives
	// in pre-allocated storage on the TxnCoordSender itself, so the work here
	// is limited to initializing each one and then linking them. A
	// txnLockGatekeeper goes at the very bottom and connects the stack to the
	// TxnCoordSender's wrapped sender. Step one: initialize each interceptor.
	alloc := &tcs.interceptorAlloc
	alloc.txnHeartbeater.init(
		tcf.AmbientContext,
		tcs.stopper,
		tcs.clock,
		&tcs.metrics,
		tcs.heartbeatInterval,
		&alloc.txnLockGatekeeper,
		&tcs.mu.Mutex,
		&tcs.mu.txn,
	)
	alloc.txnCommitter = txnCommitter{
		st:      tcf.st,
		stopper: tcs.stopper,
		mu:      &tcs.mu.Mutex,
	}
	alloc.txnMetricRecorder = txnMetricRecorder{
		metrics: &tcs.metrics,
		clock:   tcs.clock,
		txn:     &tcs.mu.txn,
	}
	tcs.initCommonInterceptors(tcf, txn, kv.RootTxn)

	// Step two: arrange the initialized interceptors in their required order.
	alloc.arr = [...]txnInterceptor{
		&alloc.txnHeartbeater,
		// Several interceptors further down depend on sequence numbers having
		// been allocated, hence the allocator sits close to the top.
		&alloc.txnSeqNumAllocator,
		// The pipeliner is placed above the span refresher because it never
		// produces transaction retry errors that a refresh could avoid.
		&alloc.txnPipeliner,
		// The span refresher may resend whole batches in order to avoid
		// transaction retries, so care is needed about which interceptors are
		// placed underneath it.
		&alloc.txnSpanRefresher,
		// The committer is placed under the span refresher so that retryable
		// errors it produces get a chance to be "refreshed away" without a
		// txn restart. Since the span refresher can re-issue batches, the
		// committer must be careful about which parts of the batch it
		// mutates: mutations need to be idempotent and should not write to
		// memory when nothing changes, to avoid looking like a data race.
		&alloc.txnCommitter,
		// The metrics recorder is last so that it observes every
		// transformation performed by the interceptors above it.
		&alloc.txnMetricRecorder,
	}
	tcs.interceptorStack = alloc.arr[:]

	tcs.connectInterceptors()

	tcs.mu.txn.Update(txn)
	return tcs
}
// initCommonInterceptors initializes the interceptors that both root and leaf
// coordinators set up here: the pipeliner, the span refresher, the lock
// gatekeeper and the sequence number allocator's starting write sequence. typ
// selects the root- or leaf-specific settings.
func (tc *TxnCoordSender) initCommonInterceptors(
	tcf *TxnCoordSenderFactory, txn *roachpb.Transaction, typ kv.TxnType,
) {
	// The range iterator factory only gets a DistSender when the factory's
	// wrapped sender actually is one.
	riGen := rangeIteratorFactory{}
	if distSender, isDistSender := tcf.wrapped.(*DistSender); isDistSender {
		riGen.ds = distSender
	}

	isRoot := typ == kv.RootTxn
	isLeaf := typ == kv.LeafTxn
	alloc := &tc.interceptorAlloc

	alloc.txnPipeliner = txnPipeliner{
		st:                     tcf.st,
		riGen:                  riGen,
		txnMetrics:             &tc.metrics,
		condensedIntentsEveryN: &tc.TxnCoordSenderFactory.condensedIntentsEveryN,
	}
	alloc.txnSpanRefresher = txnSpanRefresher{
		st:    tcf.st,
		knobs: &tcf.testingKnobs,
		riGen: riGen,
		// Refresh span retries are only permitted on root transactions, since
		// only the root has the complete set of refresh spans. A leaf (as in
		// a distributed sql flow) has to propagate the error to the root for
		// an epoch restart.
		canAutoRetry:                  isRoot,
		refreshSuccess:                tc.metrics.RefreshSuccess,
		refreshFail:                   tc.metrics.RefreshFail,
		refreshFailWithCondensedSpans: tc.metrics.RefreshFailWithCondensedSpans,
		refreshMemoryLimitExceeded:    tc.metrics.RefreshMemoryLimitExceeded,
		refreshAutoRetries:            tc.metrics.RefreshAutoRetries,
	}
	alloc.txnLockGatekeeper = txnLockGatekeeper{
		wrapped:                 tc.wrapped,
		mu:                      &tc.mu.Mutex,
		allowConcurrentRequests: isLeaf,
	}
	alloc.txnSeqNumAllocator.writeSeq = txn.Sequence
}
// connectInterceptors links the interceptor stack together: every interceptor
// wraps the one that follows it in interceptorStack, and the final one wraps
// the txnLockGatekeeper.
func (tc *TxnCoordSender) connectInterceptors() {
	stack := tc.interceptorStack
	bottom := len(stack) - 1
	for idx := range stack {
		var next lockedSender
		if idx == bottom {
			next = &tc.interceptorAlloc.txnLockGatekeeper
		} else {
			next = stack[idx+1]
		}
		stack[idx].setWrapped(next)
	}
}
// newLeafTxnCoordSender builds the TxnCoordSender for a leaf transaction from
// the input state handed over by the root. The txn carried in tis must be
// initialized and PENDING; anything else is reported through log.Fatalf.
func newLeafTxnCoordSender(
	tcf *TxnCoordSenderFactory, tis *roachpb.LeafTxnInputState,
) kv.TxnSender {
	ctx := context.TODO()
	txn := &tis.Txn
	txn.AssertInitialized(ctx)
	if txn.Status != roachpb.PENDING {
		log.Fatalf(ctx, "unexpected non-pending txn in LeafTransactionalSender: %s", tis)
	}

	tcs := &TxnCoordSender{
		TxnCoordSenderFactory: tcf,
		typ:                   kv.LeafTxn,
	}
	tcs.mu.txnState = txnPending
	// tcs.mu.userPriority is deliberately left unset: the field is only used
	// in root txns.

	// Build the request/response interceptor stack. Every interceptor lives
	// in pre-allocated storage on the TxnCoordSender itself, so the work here
	// is limited to initializing each one and then linking them. A
	// txnLockGatekeeper goes at the very bottom and connects the stack to the
	// TxnCoordSender's wrapped sender. Step one: initialize each interceptor.
	tcs.initCommonInterceptors(tcf, txn, kv.LeafTxn)

	// Leaf-specific initialization, per interceptor. If/when more
	// interceptors need it, this should become an interface method on
	// txnInterceptor that is invoked in a loop here.
	alloc := &tcs.interceptorAlloc
	alloc.txnPipeliner.initializeLeaf(tis)
	alloc.txnSeqNumAllocator.initializeLeaf(tis)

	// Step two: arrange the initialized interceptors in their required order.
	alloc.arr = [cap(alloc.arr)]txnInterceptor{
		// LeafTxns never perform writes, so the sequence number allocator
		// should never advance its counter during its lifetime. It still has
		// the important job of stamping each read request with the latest
		// sequence number.
		&alloc.txnSeqNumAllocator,
		// Leaves need the pipeliner so that reads chain onto the in-flight
		// writes they are supposed to observe.
		&alloc.txnPipeliner,
		// The span refresher may be needed to accumulate the spans that get
		// reported to the Root. See also: #24798.
		//
		// Note: this interceptor has to stay last in the list because it is
		// only conditionally part of the stack. See below.
		&alloc.txnSpanRefresher,
	}
	// No other interceptor appears in a LeafTxn's stack, because none of them
	// serves a purpose on leaves.

	// When the root has told us that it does not need the read spans, the
	// txnSpanRefresher is left out of the stack.
	stackLen := 3
	if tis.RefreshInvalid {
		stackLen = 2
	}
	tcs.interceptorStack = alloc.arr[:stackLen]

	tcs.connectInterceptors()

	tcs.mu.txn.Update(txn)
	return tcs
}
// DisablePipelining is part of the TxnSender interface. It marks the
// txnPipeliner interceptor as disabled. Once the coordinator has become
// active, the request is refused with an error instead.
func (tc *TxnCoordSender) DisablePipelining() error {
	tc.mu.Lock()
	defer tc.mu.Unlock()
	if !tc.mu.active {
		tc.interceptorAlloc.txnPipeliner.disabled = true
		return nil
	}
	return errors.Errorf("cannot disable pipelining on a running transaction")
}
// generateTxnDeadlineExceededErr builds the error returned when txn's write
// timestamp has been pushed to or past its commit deadline. The result is a
// TransactionRetryError with reason RETRY_COMMIT_DEADLINE_EXCEEDED, attached
// to txn, whose message reports by how much the deadline was exceeded.
func generateTxnDeadlineExceededErr(txn *roachpb.Transaction, deadline hlc.Timestamp) *kvpb.Error {
	overshoot := txn.WriteTimestamp.GoTime().Sub(deadline.GoTime())
	retryErr := kvpb.NewTransactionRetryError(
		kvpb.RETRY_COMMIT_DEADLINE_EXCEEDED,
		redact.Sprintf(
			"txn timestamp pushed too much; deadline exceeded by %s (%s > %s)",
			overshoot, txn.WriteTimestamp, deadline),
	)
	return kvpb.NewErrorWithTxn(retryErr, txn)
}
// finalizeNonLockingTxnLocked finalizes a non-locking txn by marking it as
// either committed or aborted. The effect is equivalent to sending an
// EndTxnRequest, only cheaper. A non-locking txn has no transaction record,
// so no request needs to reach the server. The txnCommitter interceptor
// already elides an EndTxnRequest for a non-locking txn, but calling this
// method and short-circuiting even earlier is more efficient still (and
// shows in benchmarks).
//
// The first request in ba is expected to be the EndTxn. A commit whose
// deadline is not above the txn's write timestamp is turned into a
// deadline-exceeded retry error instead of being finalized.
//
// TODO(nvanbenschoten): we could have this call into txnCommitter's
// sendLockedWithElidedEndTxn method, but we would want to confirm
// that doing so doesn't cut into the speed-up we see from this fast-path.
func (tc *TxnCoordSender) finalizeNonLockingTxnLocked(
	ctx context.Context, ba *kvpb.BatchRequest,
) *kvpb.Error {
	endTxn := ba.Requests[0].GetEndTxn()

	// Rollback: nothing to validate, just mark the txn aborted and clean up.
	if !endTxn.Commit {
		tc.mu.txn.Status = roachpb.ABORTED
		tc.finalizeAndCleanupTxnLocked(ctx)
		return nil
	}

	if deadline := endTxn.Deadline; !deadline.IsEmpty() && deadline.LessEq(tc.mu.txn.WriteTimestamp) {
		txnCopy := tc.mu.txn.Clone()
		deadlineErr := generateTxnDeadlineExceededErr(txnCopy, deadline)
		// The epoch has to be bumped and this retriable error transformed.
		ba.Txn = txnCopy
		return tc.updateStateLocked(ctx, ba, nil /* br */, deadlineErr)
	}

	// Record the commit so that, when the commit was issued by the closure
	// passed to db.Txn(), db.Txn() does not try to commit a second time. It
	// also makes sure the correct metric gets incremented.
	tc.mu.txn.Status = roachpb.COMMITTED
	tc.finalizeAndCleanupTxnLocked(ctx)
	if err := tc.maybeCommitWait(ctx, false /* deferred */); err != nil {
		return kvpb.NewError(err)
	}
	return nil
}
// Send is part of the TxnSender interface. It pushes ba through the
// interceptor stack on behalf of the transaction, folds the outcome into the
// coordinator's state, and finalizes the transaction when the batch carried an
// EndTxn that either committed successfully or was a rollback.
func (tc *TxnCoordSender) Send(
	ctx context.Context, ba *kvpb.BatchRequest,
) (*kvpb.BatchResponse, *kvpb.Error) {
	// NOTE: The locking here is unusual. Despite appearances, the lock is NOT
	// held continuously for the duration of the Send. It is acquired here and
	// released at the bottom of the interceptor stack, in the
	// txnLockGatekeeper. That interceptor re-acquires it once the response
	// arrives, and the deferred call below releases it again.
	tc.mu.Lock()
	defer tc.mu.Unlock()
	tc.mu.active = true

	if rejectErr := tc.maybeRejectIncompatibleRequest(ctx, ba); rejectErr != nil {
		return nil, rejectErr
	}
	if rejectErr := tc.maybeRejectClientLocked(ctx, ba); rejectErr != nil {
		return nil, rejectErr
	}

	// Fast path: a lone EndTxn for a txn that never acquired locks is
	// finalized locally.
	if ba.IsSingleEndTxnRequest() && !tc.interceptorAlloc.txnPipeliner.hasAcquiredLocks() {
		return nil, tc.finalizeNonLockingTxnLocked(ctx, ba)
	}

	ctx, sp := tc.AnnotateCtxWithSpan(ctx, OpTxnCoordSender)
	defer sp.Finish()

	// Associate the txnID with the trace.
	if tc.mu.txn.ID == (uuid.UUID{}) {
		log.Fatalf(ctx, "cannot send transactional request through unbound TxnCoordSender")
	}
	if sp.IsVerbose() {
		sp.SetTag("txnID", attribute.StringValue(tc.mu.txn.ID.String()))
		ctx = logtags.AddTag(ctx, "txn", uuid.ShortStringer(tc.mu.txn.ID))
		if log.V(2) {
			ctx = logtags.AddTag(ctx, "ts", tc.mu.txn.WriteTimestamp)
		}
	}

	// Inconsistent reads make no sense inside a transaction, yet the field
	// still has to be accepted as a parameter for this to compile.
	if ba.ReadConsistency != kvpb.CONSISTENT {
		return nil, kvpb.NewErrorf("cannot use %s ReadConsistency in txn",
			ba.ReadConsistency)
	}

	// An empty batch has nothing to send.
	if len(ba.Requests) == 0 {
		return nil, nil
	}

	// Hand the batch its own clone of the Txn proto so that later
	// modifications need no synchronization.
	ba.Txn = tc.mu.txn.Clone()

	// Run the batch through the txnInterceptor stack.
	br, pErr := tc.interceptorStack[0].SendLocked(ctx, ba)

	pErr = tc.updateStateLocked(ctx, ba, br, pErr)

	// A successful commit, or any rollback attempt, moves the coordinator to
	// txnFinalized.
	if arg, hasEndTxn := ba.GetArg(kvpb.EndTxn); hasEndTxn {
		endTxn := arg.(*kvpb.EndTxnRequest)
		switch {
		case !endTxn.Commit:
			tc.finalizeAndCleanupTxnLocked(ctx)
		case pErr == nil:
			tc.finalizeAndCleanupTxnLocked(ctx)
			if err := tc.maybeCommitWait(ctx, false /* deferred */); err != nil {
				return nil, kvpb.NewError(err)
			}
		}
	}

	if pErr != nil {
		return nil, pErr
	}

	if br != nil && br.Error != nil {
		panic(kvpb.ErrorUnexpectedlySet(nil /* culprit */, br))
	}

	return br, nil
}
// maybeCommitWait performs a "commit-wait" sleep, if doing so is deemed
// necessary for consistency.
//
// By default, commit-wait is only necessary for transactions that commit
// with a future-time timestamp that leads the local HLC clock. This is
// because CockroachDB's consistency model depends on all transactions
// waiting until their commit timestamp is below their gateway clock. In
// doing so, transactions ensure that at the time that they complete, all
// other clocks in the system (i.e. on all possible gateways) will be no
// more than the max_offset below the transaction's commit timestamp. This
// property ensures that all causally dependent transactions will have an
// uncertainty interval (see GlobalUncertaintyLimit) that exceeds the
// original transaction's commit timestamp, preventing stale reads. Without
// the wait, it would be possible for a read-write transaction to write a
// future-time value and then for a causally dependent transaction to read
// below that future-time value, violating "read your writes".
//
// The property must also hold for read-only transactions, which may have a
// commit timestamp in the future due to an uncertainty restart after
// observing a future-time value in their uncertainty interval. In such
// cases, the property that the transaction must wait for the local HLC
// clock to exceed its commit timestamp is not necessary to prevent stale
// reads, but it is necessary to ensure monotonic reads. Without the wait,
// it would be possible for a read-only transaction coordinated on a gateway
// with a fast clock to return a future-time value and then for a causally
// dependent read-only transaction coordinated on a gateway with a slow
// clock to read below that future-time value, violating "monotonic reads".
//
// In practice, most transactions do not need to wait at all, because their
// commit timestamps were pulled from an HLC clock (either the local clock
// or a remote clock on a node whom the local node has communicated with)
// and so they will be guaranteed to lag the local HLC's clock, assuming
// proper HLC time propagation. Only transactions whose commit timestamps
// were pushed into the future will need to wait, like those who wrote to a
// global_read range and got bumped by the closed timestamp or those who
// conflicted (write-read or write-write) with an existing future-time
// value.
//
// However, CockroachDB also supports a stricter model of consistency
// through its "linearizable" flag. When in linearizable mode (also known as
// "strict serializable" mode), all writing transactions (but not read-only
// transactions) must wait an additional max_offset after committing to
// ensure that their commit timestamp is below the current HLC clock time of
// any other node in the system. In doing so, all causally dependent
// transactions are guaranteed to start with higher timestamps, regardless
// of the gateway they use. This ensures that all causally dependent
// transactions commit with higher timestamps, even if their read and write
// sets do not conflict with the original transaction's. This prevents the
// "causal reverse" anomaly which can be observed by a third, concurrent
// transaction.
//
// Even when in linearizable mode and performing this extra wait on the commit
// of read-write transactions, uncertainty intervals are still necessary. This
// is to ensure that any two reads that touch overlapping keys but are executed
// on different nodes obey real-time ordering and do not violate the "monotonic
// reads" property. Without uncertainty intervals, it would be possible for a
// read on a node with a fast clock (ts@15) to observe a committed value (ts@10)
// and then a later read on a node with a slow clock (ts@5) to miss the
// committed value. When contrasting this with Google Spanner, we notice that
// Spanner performs a similar commit-wait but then does not include uncertainty
// intervals. The reason this works in Spanner is that read-write transactions
// in Spanner hold their locks across the commit-wait duration, which blocks
// concurrent readers and enforces real-time ordering between any two readers as
// well as between the writer and any future reader. Read-write transactions in
// CockroachDB do not hold locks across commit-wait (they release them before),
// so the uncertainty interval is still needed.
//
// The deferred argument distinguishes the automatic call made at commit time
// (false) from a call made by a caller that took over responsibility for the
// wait (true). The method is entered with tc.mu held; the mutex is released
// for the duration of the sleep and re-acquired before returning.
//
// For more, see https://www.cockroachlabs.com/blog/consistency-model/ and
// docs/RFCS/20200811_non_blocking_txns.md.
func (tc *TxnCoordSender) maybeCommitWait(ctx context.Context, deferred bool) error {
	if tc.mu.txn.Status != roachpb.COMMITTED {
		log.Fatalf(ctx, "maybeCommitWait called when not committed")
	}

	// An automatic call has nothing to do yet when the user of this
	// transaction opted to defer the commit-wait and handle it externally.
	if !deferred && tc.mu.commitWaitDeferred {
		return nil
	}

	// Writing transactions in linearizable mode wait an extra max_offset
	// beyond their commit timestamp.
	waitUntil := tc.mu.txn.WriteTimestamp
	if isWriter := tc.mu.txn.Sequence != 0; isWriter && tc.linearizable {
		waitUntil = waitUntil.Add(tc.clock.MaxOffset().Nanoseconds(), 0)
	}

	// No-wait fast path, which is the common case: only transactions whose
	// commit timestamp was bumped into the future have to sleep.
	if now := tc.clock.Now(); waitUntil.LessEq(now) {
		return nil
	}

	if filter := tc.testingKnobs.CommitWaitFilter; filter != nil {
		filter()
	}

	start := tc.clock.PhysicalTime()
	log.VEventf(ctx, 2, "performing commit-wait sleep for ~%s", waitUntil.GoTime().Sub(start))

	// NB: the lock is dropped while sleeping so that it is not held for the
	// whole commit-wait.
	tc.mu.Unlock()
	err := tc.clock.SleepUntil(ctx, waitUntil)
	tc.mu.Lock()
	if err != nil {
		return err
	}

	elapsed := tc.clock.PhysicalTime().Sub(start)
	log.VEventf(ctx, 2, "completed commit-wait sleep, took %s", elapsed)
	tc.metrics.CommitWaits.Inc(1)
	return nil
}
// maybeRejectIncompatibleRequest reports whether the given BatchRequest can be
// handled by this kind of TxnCoordSender, returning a non-nil error when it
// cannot. Root coordinators accept every batch. Leaf coordinators reject
// locking requests with an assertion failure. Any other TxnType panics.
func (tc *TxnCoordSender) maybeRejectIncompatibleRequest(
	ctx context.Context, ba *kvpb.BatchRequest,
) *kvpb.Error {
	if tc.typ == kv.RootTxn {
		return nil
	}
	if tc.typ != kv.LeafTxn {
		panic("unexpected TxnType")
	}
	if !ba.IsLocking() {
		return nil
	}
	return kvpb.NewError(errors.WithContextTags(errors.AssertionFailedf(
		"LeafTxn %s incompatible with locking request %s", tc.mu.txn, ba.Summary()), ctx))
}
// maybeRejectClientLocked checks whether the transaction is in a state that
// prevents it from continuing, such as the heartbeat having detected the
// transaction to have been aborted.
//
// ba is the batch that the client is trying to send. It's inspected because
// rollbacks are always allowed. Can be nil, in which case the request is never
// treated as a rollback.
//
// The result is nil when the client may proceed, and otherwise the error that
// should be returned to the client. The function reads fields under tc.mu and,
// on the aborted path, updates coordinator state through
// handleRetryableErrLocked; the caller is expected to hold tc.mu.
func (tc *TxnCoordSender) maybeRejectClientLocked(
	ctx context.Context, ba *kvpb.BatchRequest,
) *kvpb.Error {
	rollback := ba != nil && ba.IsSingleAbortTxnRequest()
	if rollback && tc.mu.txn.Status != roachpb.COMMITTED {
		// As a special case, we allow rollbacks to be sent at any time. Any
		// rollback attempt moves the TxnCoordSender state to txnFinalized, but higher
		// layers are free to retry rollbacks if they want (and they do, for
		// example, when the context was canceled while txn.Rollback() was running).
		//
		// However, we reject this if we know that the transaction has been
		// committed, to avoid sending the rollback concurrently with the
		// txnCommitter asynchronously making the commit explicit. See:
		// https://github.com/cockroachdb/cockroach/issues/68643
		return nil
	}

	// Check the transaction coordinator state.
	switch tc.mu.txnState {
	case txnPending:
		// All good.
	case txnRetryableError:
		return kvpb.NewError(tc.mu.storedRetryableErr)
	case txnError:
		return tc.mu.storedErr
	case txnFinalized:
		// ba is allowed to be nil (see the function comment), so guard the
		// Summary() call: building the rejection message must not itself
		// dereference a nil batch.
		summary := "<nil batch>"
		if ba != nil {
			summary = ba.Summary()
		}
		msg := redact.Sprintf("client already committed or rolled back the transaction. "+
			"Trying to execute: %s", summary)
		if !rollback {
			// If the client is trying to do anything other than rollback, it is
			// unexpected for it to find the transaction already in a txnFinalized
			// state. This may be a bug, so log a stack trace.
			stack := string(debug.Stack())
			log.Errorf(ctx, "%s. stack:\n%s", msg, stack)
		}
		reason := kvpb.TransactionStatusError_REASON_UNKNOWN
		if tc.mu.txn.Status == roachpb.COMMITTED {
			reason = kvpb.TransactionStatusError_REASON_TXN_COMMITTED
		}
		return kvpb.NewErrorWithTxn(kvpb.NewTransactionStatusError(reason, msg), &tc.mu.txn)
	}

	// Check the transaction proto state, along with any finalized transaction
	// status observed by the transaction heartbeat loop.
	protoStatus := tc.mu.txn.Status
	hbObservedStatus := tc.interceptorAlloc.txnHeartbeater.mu.finalObservedStatus
	switch {
	case protoStatus == roachpb.ABORTED:
		// The transaction was rolled back synchronously.
		fallthrough
	case protoStatus != roachpb.COMMITTED && hbObservedStatus == roachpb.ABORTED:
		// The transaction heartbeat observed an aborted transaction record and
		// this was not due to a synchronous transaction commit and transaction
		// record garbage collection.
		// See the comment on txnHeartbeater.mu.finalObservedStatus for more details.
		abortedErr := kvpb.NewErrorWithTxn(
			kvpb.NewTransactionAbortedError(kvpb.ABORT_REASON_CLIENT_REJECT), &tc.mu.txn)
		return kvpb.NewError(tc.handleRetryableErrLocked(ctx, abortedErr))
	case protoStatus != roachpb.PENDING || hbObservedStatus != roachpb.PENDING:
		// The transaction proto is in an unexpected state.
		return kvpb.NewErrorf(
			"unexpected txn state: %s; heartbeat observed status: %s", tc.mu.txn, hbObservedStatus)
	default:
		// All good.
	}
	return nil
}
// ClientFinalized is part of the kv.TxnSender interface. It reports whether
// the coordinator's state is txnFinalized, reading that state while holding
// tc.mu.
func (tc *TxnCoordSender) ClientFinalized() bool {
	tc.mu.Lock()
	defer tc.mu.Unlock()
	state := tc.mu.txnState
	return state == txnFinalized
}
// finalizeAndCleanupTxnLocked marks the transaction state as finalized and
// closes all interceptors.
//
// It sets tc.mu.txnState to txnFinalized and then delegates the interceptor
// shutdown to cleanupTxnLocked.
// NOTE(review): the Locked suffix suggests the caller must already hold
// tc.mu — confirm against callers.
func (tc *TxnCoordSender) finalizeAndCleanupTxnLocked(ctx context.Context) {
// Record the finalized state first, then close the interceptors.
tc.mu.txnState = txnFinalized
tc.cleanupTxnLocked(ctx)
}
// cleanupTxnLocked closes every interceptor in the interceptor stack. The
// tc.mu.closed flag makes the operation run at most once: the first call sets
// the flag and closes the interceptors, later calls do nothing.
func (tc *TxnCoordSender) cleanupTxnLocked(ctx context.Context) {
	if !tc.mu.closed {
		// Flip the flag before closing so the work is never repeated.
		tc.mu.closed = true
		for _, interceptor := range tc.interceptorStack {
			interceptor.closeLocked()
		}
	}
}
// UpdateStateOnRemoteRetryableErr is part of the TxnSender interface. It
// acquires tc.mu, runs the given error through handleRetryableErrLocked, and
// returns the resulting retry error wrapped in a kvpb.Error.
func (tc *TxnCoordSender) UpdateStateOnRemoteRetryableErr(
	ctx context.Context, pErr *kvpb.Error,
) *kvpb.Error {
	tc.mu.Lock()
	defer tc.mu.Unlock()
	retryErr := tc.handleRetryableErrLocked(ctx, pErr)
	return kvpb.NewError(retryErr)
}
// handleRetryableErrLocked converts a retryable error into a
// TransactionRetryWithProtoRefreshError that carries the transaction to be
// used by the next attempt, and updates the TxnCoordSender's state to match.
//
// Depending on the error, the TxnCoordSender might not be usable afterwards
// (in case of TransactionAbortedError). The caller is expected to check the ID
// of the resulting transaction. If the TxnCoordSender can still be used, it
// will have been prepared for a new epoch.
func (tc *TxnCoordSender) handleRetryableErrLocked(
	ctx context.Context, pErr *kvpb.Error,
) *kvpb.TransactionRetryWithProtoRefreshError {
	// When a retryable error is already stored, only a strictly higher-priority
	// error may replace it. Otherwise hand back the stored error so that
	// information about the higher-priority error is not lost.
	if tc.mu.txnState == txnRetryableError {
		incomingPrio := kvpb.ErrPriority(pErr.GoError())
		storedPrio := kvpb.ErrPriority(tc.mu.storedRetryableErr)
		if incomingPrio <= storedPrio {
			return tc.mu.storedRetryableErr
		}
	}

	// Bump the restart metric matching the cause of the retry. The individual
	// error types are described on the metaRestart variables.
	switch detail := pErr.GetDetail().(type) {
	case *kvpb.TransactionRetryError:
		switch detail.Reason {
		case kvpb.RETRY_WRITE_TOO_OLD:
			tc.metrics.RestartsWriteTooOld.Inc()
		case kvpb.RETRY_SERIALIZABLE:
			tc.metrics.RestartsSerializable.Inc()
		case kvpb.RETRY_ASYNC_WRITE_FAILURE:
			tc.metrics.RestartsAsyncWriteFailure.Inc()
		case kvpb.RETRY_COMMIT_DEADLINE_EXCEEDED:
			tc.metrics.RestartsCommitDeadlineExceeded.Inc()
		default:
			tc.metrics.RestartsUnknown.Inc()
		}
	case *kvpb.WriteTooOldError:
		tc.metrics.RestartsWriteTooOldMulti.Inc()
	case *kvpb.ReadWithinUncertaintyIntervalError:
		tc.metrics.RestartsReadWithinUncertainty.Inc()
	case *kvpb.TransactionAbortedError:
		tc.metrics.RestartsTxnAborted.Inc()
	case *kvpb.TransactionPushError:
		tc.metrics.RestartsTxnPush.Inc()
	default:
		tc.metrics.RestartsUnknown.Inc()
	}

	// Build the error passed up to the next layer. It records the ID of the
	// transaction that hit the error alongside the transaction to retry with.
	failedTxnID := pErr.GetTxn().ID
	retryTxn := kvpb.PrepareTransactionForRetry(ctx, pErr, tc.mu.userPriority, tc.clock)
	refreshErr := kvpb.NewTransactionRetryWithProtoRefreshError(
		redact.Sprint(pErr), failedTxnID, retryTxn)

	// Enter the retryable-error state; every Send() fails until it is cleared.
	tc.mu.txnState = txnRetryableError
	tc.mu.storedRetryableErr = refreshErr

	if failedTxnID == retryTxn.ID {
		// Same transaction, new epoch: adopt the updated proto and let each
		// interceptor reset its epoch-scoped state.
		tc.mu.txn.Update(&retryTxn)
		log.VEventf(ctx, 2, "resetting epoch-based coordinator state on retry")
		for _, interceptor := range tc.interceptorStack {
			interceptor.epochBumpedLocked()
		}
		return refreshErr
	}

	// The ID changed, so a brand new transaction is required and the old one is
	// done for. Mark it aborted so future requests are rejected, abort it
	// asynchronously, and close the interceptors. The client is expected to
	// create a new TxnCoordSender rather than reuse this one.
	tc.mu.txn.Status = roachpb.ABORTED
	tc.interceptorAlloc.txnHeartbeater.abortTxnAsyncLocked(ctx)
	tc.cleanupTxnLocked(ctx)
	return refreshErr
}
// updateStateLocked folds the outcome of a batch (response or error) into the
// coordinator's transaction state. For retryable errors on a root it returns
// an error carrying the updated transaction for use by client restarts.
//
// Three outcomes are distinguished:
//  1. Success (pErr == nil): the transaction proto is updated from br.Txn.
//  2. Non-retryable error: unless the error's priority is
//     ErrorScoreUnambiguousError, the coordinator moves to txnError. If the
//     error carries a transaction, the proto is updated from it (it is
//     unclear whether that serves any purpose).
//  3. Retryable error: on a leaf the error is stored verbatim and the
//     interceptors are closed; on a root the error is transformed by
//     handleRetryableErrLocked, which either rolls the transaction back (when
//     a new transaction is required, i.e. TransactionAbortedError) or
//     prepares the proto for a new epoch.
//
// NOTE: ideally a retryable error would move the coordinator to txnError, but
// the current interface gives the client no way to acknowledge the error and
// control the switch to the new epoch. As a consequence, concurrent users of
// a txn might operate at the wrong epoch if they race with the receipt of
// such an error.
func (tc *TxnCoordSender) updateStateLocked(
	ctx context.Context, ba *kvpb.BatchRequest, br *kvpb.BatchResponse, pErr *kvpb.Error,
) *kvpb.Error {
	// Outcome 1: success.
	if pErr == nil {
		tc.mu.txn.Update(br.Txn)
		return nil
	}

	// Outcome 2: non-retryable error.
	if pErr.TransactionRestart() == kvpb.TransactionRestart_NONE {
		// Most errors stop the transaction from accepting further requests
		// (except a rollback), but some are safe to continue after (in
		// particular ConditionFailedError); SQL can then recover by rolling
		// back to a savepoint.
		if kvpb.ErrPriority(pErr.GoError()) != kvpb.ErrorScoreUnambiguousError {
			tc.mu.txnState = txnError
			tc.mu.storedErr = kvpb.NewError(&kvpb.TxnAlreadyEncounteredErrorError{
				PrevError: pErr.String(),
			})
		}
		// Absorb whatever transaction information the error carries.
		errTxn := pErr.GetTxn()
		if errTxn == nil {
			return pErr
		}
		if err := sanityCheckErrWithTxn(ctx, pErr, ba, &tc.testingKnobs); err != nil {
			return kvpb.NewError(err)
		}
		tc.mu.txn.Update(errTxn)
		return pErr
	}

	// Outcome 3: retryable error.
	if tc.typ == kv.LeafTxn {
		// Leaves handle retryable errors differently than roots. A leaf is not
		// supposed to be used any more after such an error, and the error has
		// to make its way back to the root. From now on every Send() returns
		// this same error, which is why it is stored as-is rather than wrapped
		// in TxnAlreadyEncounteredErrorError as is done elsewhere.
		tc.mu.txnState = txnError
		tc.mu.storedErr = pErr
		tc.mu.txn.Update(pErr.GetTxn())
		tc.cleanupTxnLocked(ctx)
		return pErr
	}

	// KV should only return errors for the transaction named in the
	// BatchRequest; anything else is fatal.
	if batchTxnID, errTxnID := ba.Txn.ID, pErr.GetTxn().ID; errTxnID != batchTxnID {
		log.Fatalf(ctx, "retryable error for the wrong txn. ba.Txn: %s. pErr: %s",
			ba.Txn, pErr)
	}
	return kvpb.NewError(tc.handleRetryableErrLocked(ctx, pErr))
}
// sanityCheckErrWithTxn verifies whether the error (which must have a txn
// attached) contains a COMMITTED transaction. Only rollbacks should be able to
// encounter such errors. Marking a transaction as explicitly-committed can also
// encounter these errors, but those errors don't make it to the TxnCoordSender.
//
// Wraps the error in case of an assertion violation, otherwise returns as-is.
//
// This checks for the occurrence of a known issue involving ambiguous write
// errors that occur alongside commits, which may race with transaction
// recovery requests started by contending operations.
// https://github.com/cockroachdb/cockroach/issues/103817
func sanityCheckErrWithTxn(
ctx context.Context, pErrWithTxn *kvpb.Error, ba *kvpb.BatchRequest, _ *ClientTestingKnobs,
) error {
txn := pErrWithTxn.GetTxn()
if txn.Status != roachpb.COMMITTED {