-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
api.proto
3087 lines (2815 loc) · 147 KB
/
api.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto3";
package cockroach.roachpb;
option go_package = "roachpb";
import "errorspb/errors.proto";
import "kv/kvserver/concurrency/lock/locking.proto";
import "kv/kvserver/readsummary/rspb/summary.proto";
import "roachpb/data.proto";
import "roachpb/errors.proto";
import "roachpb/metadata.proto";
import "roachpb/span_config.proto";
import "settings/encoding.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "util/tracing/tracingpb/recorded_span.proto";
import "util/tracing/tracingpb/tracing.proto";
import "gogoproto/gogo.proto";
import "google/protobuf/duration.proto";
// ReadConsistencyType specifies what type of consistency is observed
// during read operations.
enum ReadConsistencyType {
// Generate the Go constants without the enum type name as a prefix.
option (gogoproto.goproto_enum_prefix) = false;
// CONSISTENT reads are guaranteed to read committed data; the
// mechanism relies on clocks to determine lease expirations.
// As the zero value, this is the default when the field is unset.
CONSISTENT = 0;
// READ_UNCOMMITTED reads return both committed and uncommitted data.
// The consistency type is similar to INCONSISTENT in that using it
// can result in dirty reads. However, like the CONSISTENT type, it
// requires the replica performing the read to hold a valid read lease,
// meaning that it can't return arbitrarily stale data.
READ_UNCOMMITTED = 1;
// INCONSISTENT reads return the latest available, committed values.
// They are more efficient, but may read stale values as pending
// intents are ignored.
INCONSISTENT = 2;
}
// RoutingPolicy specifies how a request should be routed to the
// replicas of its target range(s) by the DistSender. Policies can
// dictate which replicas are considered to be targets and in which
// order.
enum RoutingPolicy {
// LEASEHOLDER means that the DistSender should route the request to the
// leaseholder replica(s) of its target range(s). As the zero value, this
// is the policy in effect when the field is unset.
LEASEHOLDER = 0;
// NEAREST means that the DistSender should route the request to the
// nearest replica(s) of its target range(s).
NEAREST = 1;
}
// ResumeReason specifies why a ResumeSpan was generated instead of a
// complete result.
enum ResumeReason {
// Generate the Go constants without the enum type name as a prefix.
option (gogoproto.goproto_enum_prefix) = false;
// Zero value: no resume, or an unknown reason from a future or past
// CockroachDB version.
RESUME_UNKNOWN = 0;
// A key limit was exceeded, i.e. MaxSpanRequestKeys.
RESUME_KEY_LIMIT = 1;
// A byte limit was exceeded, i.e. TargetBytes.
// NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
RESUME_BYTE_LIMIT = 2;
// An intent limit was exceeded. This is currently never returned to clients,
// since MVCCScan converts the result into a WriteIntentError.
// NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
RESUME_INTENT_LIMIT = 3;
// The DistSender encountered a range boundary and returned a partial result,
// in response to return_on_range_boundary.
RESUME_RANGE_BOUNDARY = 4;
}
// RequestHeader is supplied with every storage node request.
message RequestHeader {
// Field numbers 1 and 2 are reserved and must not be reused
// (presumably they belonged to fields that were removed).
reserved 1, 2;
// The key for the request. If the request operates on a range, this
// represents the starting key for the range.
bytes key = 3 [(gogoproto.casttype) = "Key"];
// The end key is empty if the request spans only a single key. Otherwise,
// it must order strictly after Key. In such a case, the header indicates
// that the operation takes place on the key range from Key to EndKey,
// including Key and excluding EndKey.
bytes end_key = 4 [(gogoproto.casttype) = "Key"];
// A zero-indexed transactional sequence number. Represented as
// enginepb.TxnSeq in the generated Go code.
int32 sequence = 5 [
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
}
// ResponseHeader is returned with every storage node response. Note that this is
// different from a BatchResponse's header.
//
// NOTE(review): field numbers 1 and 2 are neither used nor reserved in this
// message; if they were ever assigned in the past they should be reserved -
// confirm against the file history.
message ResponseHeader {
// txn is non-nil if the request specified a non-nil transaction.
// The transaction timestamp and/or priority may have been updated,
// depending on the outcome of the request.
//
// Once txn is merged into the BatchResponse_Header.Txn, it will be
// reset to nil to avoid sending superfluous information over the
// network.
Transaction txn = 3;
// The next span to resume from when the response doesn't cover the full span
// requested. This can happen when a bound on the result size is set through
// max_span_request_keys or target_bytes in the batch header or when a scan
// has been stopped before covering the requested data because of
// scan_options.
//
// resume_span is unset when the entire span of keys has been
// operated on. The span is set to the original span if the request
// was ignored because max_span_request_keys was hit due to another
// request in the batch. For a reverse scan the end_key is updated.
Span resume_span = 4;
// When resume_span is populated, this specifies the reason why the operation
// wasn't completed and needs to be resumed.
ResumeReason resume_reason = 7;
// When resume_reason is RESUME_BYTE_LIMIT, this may contain the size of the
// next result entry which caused the limit to be exceeded, i.e. the size of
// the first entry when reading from the resume span. It is only supported by
// Get and Scan. In a batch, this will only be set on the first response that
// exceeds the limit.
//
// NB: This is best-effort, and may be 0 in some rare cases. Specifically, if
// TargetBytes is exactly satisfied by a result that exhausted a range scan,
// or by a response from a multi-request batch, we won't do additional work
// (e.g. send another RPC to the next range) only to obtain resume_next_bytes.
//
// Also note that this is unaffected by whole_rows_of_size. The client may
// care about whole rows, but we'll only return the size of the next KV pair
// (which may just be part of the row), to avoid the cost of additional IO.
int64 resume_next_bytes = 9;
// The number of keys operated on.
int64 num_keys = 5;
// The number of bytes returned. Only populated for requests that support it
// (at the time of writing, Scan, ReverseScan and ExportRequest). The number
// returned here corresponds to the (Header).TargetBytes field and loosely
// measures the bytes in the timestamps, keys, and values of the returned
// rows.
int64 num_bytes = 8;
// Field number 6 is reserved and must not be reused.
reserved 6;
}
// A GetRequest is the argument for the Get() method.
message GetRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The desired key-level locking mode used during this get. When set to None
// (the default), no key-level locking mode is used - meaning that the get
// does not acquire a lock. When set to any other strength, a lock of that
// strength is acquired with the Unreplicated durability (i.e. best-effort)
// on the key, if it exists.
kv.kvserver.concurrency.lock.Strength key_locking = 2;
}
// A GetResponse is the return value from the Get() method.
// If the key doesn't exist, Value will be nil.
message GetResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The value read for the key; nil if the key doesn't exist.
Value value = 2;
// The intent seen, if any, when using the READ_UNCOMMITTED consistency level.
//
// NOTE: this field is not currently populated with intents for deletion
// tombstones. It probably should be because the value field may contain a
// value that is being deleted by a corresponding intent. We should revisit
// this decision if this ever becomes a problem.
Value intent_value = 3;
}
// A ProbeRequest is an internal request type used to send a replicated
// no-op through a Range as a means of probing write availability. The
// request will be serialized like a regular write, i.e. it will acquire
// latches and declare key access, but it will not check locks (i.e.
// if an intent exists on the key that is being probed, the probe will
// not observe it). A ProbeRequest can be served by any Replica, including
// followers, i.e. it can be used to verify that a given Replica is able
// to access the replication layer.
message ProbeRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A ProbeResponse is the response to a ProbeRequest. It carries nothing
// beyond the common response header.
message ProbeResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A PutRequest is the argument to the Put() method.
message PutRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The value to put. Stored by value (non-nullable) in the generated Go
// struct.
Value value = 2 [(gogoproto.nullable) = false];
// Specify as true to put the value without a corresponding
// timestamp. This option should be used with care as it precludes
// the use of this value with transactions.
bool inline = 3;
// NOTE: For internal use only! Set to indicate that the put is
// writing to virgin keyspace and no reads are necessary to
// rationalize MVCC.
bool blind = 4;
}
// A PutResponse is the return value from the Put() method. It carries
// nothing beyond the common response header.
message PutResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A ConditionalPutRequest is the argument to the ConditionalPut() method.
//
// - Returns true and sets value if exp_bytes equals existing value.
// - If key doesn't exist and exp_bytes is empty, sets value.
// - Otherwise, returns a ConditionFailedError containing the actual value of the key.
//
// Note that the client is free to send more requests after a
// ConditionFailedError. This is not generally allowed after other errors
// because of fears over the ambiguity of the side-effects of failed requests
// (in particular, the timestamps at which intents might have been written).
// ConditionFailedError is a special case as we ensure there's no ambiguity; the
// error carries a WriteTimestamp that's the upper bound of the timestamps
// intents were written at.
message ConditionalPutRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The value to put.
Value value = 2 [(gogoproto.nullable) = false];
// deprecated_exp_value represents the expected existing value for the key.
// If the existing value is different, the request will return a
// ConditionFailedError. A missing (Go nil) deprecated_exp_value.raw_bytes
// means that the key is expected to not exist.
//
// This is deprecated in 20.2 in favor of exp_bytes, which clarifies that the
// checksum and timestamp of the expected value are irrelevant. Remove in
// 21.1.
Value deprecated_exp_value = 3;
// exp_bytes represents the expected existing value for the key. If empty, the
// key is expected to not exist. If not empty, these bytes are expected to
// contain the tag and data of the existing value (without the existing
// value's checksum; the byte array is expected to come from
// Value.TagAndDataBytes()). A value's checksum covers the key in addition to
// covering the value, so not including a checksum here makes for an easier
// to use API - the creator of the ConditionalPutRequest can simply put in
// bytes coming from a different key.
// Note that there's no such thing as expecting a key to exist, but have an
// empty value. Such key-values don't exist.
//
// Note that the existing value's timestamp doesn't matter, only its data. So,
// the CPut will succeed in ABA situations (if a reader got value A and checks
// against it later, the check will succeed even if, in the meantime, there's
// been a subsequent write of value B and another one back to value A).
bytes exp_bytes = 6;
// NOTE: For internal use only! Set to indicate that the put is
// writing to virgin keyspace and no reads are necessary to
// rationalize MVCC.
bool blind = 4;
// Typically if a specific, non-empty expected value is supplied, it *must*
// exist with that value. Passing this indicates that it is also OK if the key
// does not exist. This is useful when a given value is expected but it is
// possible it has not yet been written.
bool allow_if_does_not_exist = 5;
// Specify as true to put the value without a corresponding
// timestamp. This option should be used with care as it precludes
// the use of this value with transactions.
bool inline = 7;
}
// A ConditionalPutResponse is the return value from the
// ConditionalPut() method. It carries nothing beyond the common response
// header.
message ConditionalPutResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An InitPutRequest is the argument to the InitPut() method.
//
// - If key doesn't exist, sets value.
// - If key exists, returns a ConditionFailedError if value != existing value.
//
// If failOnTombstones is set to true, tombstone values count as mismatched
// values and will cause a ConditionFailedError.
message InitPutRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The value to put if the key doesn't exist, and the value the existing
// entry is compared against if it does.
Value value = 2 [(gogoproto.nullable) = false];
// NOTE: For internal use only! Set to indicate that the put is
// writing to virgin keyspace and no reads are necessary to
// rationalize MVCC.
bool blind = 3;
// If true, tombstones cause ConditionFailedErrors.
//
// NOTE(review): this is the only camelCase field name in the visible part
// of the file; the other fields use lower_snake_case. Renaming it would
// change the text-format/JSON name, so it is left as is.
bool failOnTombstones = 4;
}
// An InitPutResponse is the return value from the InitPut() method. It
// carries nothing beyond the common response header.
message InitPutResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An IncrementRequest is the argument to the Increment() method. It
// increments the value for key, and returns the new value. If no
// value exists for a key, incrementing by 0 is not a noop, but will
// create a zero value. IncrementRequest cannot be called on a key set
// by Put() or ConditionalPut(). Similarly, Put() and ConditionalPut()
// cannot be invoked on an incremented key.
message IncrementRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The amount to add to the key's current value. Signed on the wire, so
// presumably a negative amount decrements - confirm against the evaluator.
int64 increment = 2;
}
// An IncrementResponse is the return value from the Increment
// method. The new value after increment is specified in NewValue. If
// the value could not be decoded as specified, Error will be set.
message IncrementResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The value of the key after the increment was applied.
int64 new_value = 2;
}
// A DeleteRequest is the argument to the Delete() method. The key to delete
// is carried in the embedded request header.
message DeleteRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A DeleteResponse is the return value from the Delete() method. It carries
// nothing beyond the common response header.
message DeleteResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A DeleteRangeRequest is the argument to the DeleteRange() method. It
// specifies the range of keys to delete.
//
// A DeleteRangeRequest populates the timestamp cache and is tracked for
// refreshes.
message DeleteRangeRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key delimit the range to delete.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Field number 2 is reserved and must not be reused.
reserved 2;
// Return the keys that are deleted in the response.
bool return_keys = 3;
// Delete "inline" keys which are stored without MVCC timestamps. Note that
// an "inline" DeleteRange will fail if it attempts to delete any keys which
// contain timestamped (non-inline) values; this option should only be used on
// keys which are known to store inline values, such as data in cockroach's
// time series system.
//
// Similarly, attempts to delete keys with inline values will fail unless this
// flag is set to true; the setting must match the data being deleted.
//
// Inline values cannot be deleted transactionally; a DeleteRange with
// "inline" set to true will fail if it is executed within a transaction.
bool inline = 4;
// If enabled, the range is deleted using an MVCC range tombstone, which is a
// cheaper constant-time write operation (but still requires a scan to check
// for conflicts and adjust MVCC stats). This option cannot be used in a
// transaction, and it cannot be combined with inline or return_keys.
//
// The caller must check the MVCCRangeTombstones version gate before using
// this parameter, as it is new in 22.2.
bool use_range_tombstone = 5;
}
// A DeleteRangeResponse is the return value from the DeleteRange()
// method.
message DeleteRangeResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// All the deleted keys if return_keys is set on the request. Each entry
// uses the Key cast type in the generated Go code.
repeated bytes keys = 2 [(gogoproto.casttype) = "Key"];
}
// A ClearRangeRequest is the argument to the ClearRange() method. It
// specifies a range of keys to clear from the underlying engine. Note
// that this differs from the behavior of DeleteRange, which sets
// transactional intents and writes tombstones to the deleted
// keys. ClearRange is used when permanently dropping or truncating
// table data.
//
// ClearRange also updates the GC threshold for the range to the
// timestamp at which this command executes, to prevent reads at
// earlier timestamps from incorrectly returning empty results.
//
// NOTE: it is important that this method only be invoked on a key
// range which is guaranteed to be both inactive and not see future
// writes (until Deadline, if set, below).
// Ignoring this warning may result in data loss.
message ClearRangeRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key delimit the range to clear.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Deadline can be set to a time at or after which the server will refuse to
// execute this ClearRange request, providing a form of replay protection:
// if a caller sets this to a time in the near future, they can then wait for
// that time (plus max offset) to have passed at which point they can reuse
// the span they cleared without fear of this request being replayed later and
// clearing subsequent writes.
util.hlc.Timestamp deadline = 2 [(gogoproto.nullable) = false];
}
// A ClearRangeResponse is the return value from the ClearRange() method. It
// carries nothing beyond the common response header.
message ClearRangeResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A RevertRangeRequest specifies a range of keys in which to clear all MVCC
// revisions more recent than some TargetTime from the underlying engine, thus
// reverting the range (from the perspective of an MVCC scan) to that time.
message RevertRangeRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key delimit the range to revert.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// TargetTime specifies the time to which to "revert" the range by clearing
// any MVCC key with a strictly higher timestamp. TargetTime must be higher
// than the GC Threshold for the replica - so that it is assured that the keys
// for that time are still there - or the request will fail.
util.hlc.Timestamp target_time = 2 [(gogoproto.nullable) = false];
// This parameter has no effect on 22.2, as TBI is always enabled. For
// compatibility with 22.1 nodes, callers must continue to set this as
// appropriate until 22.2.
//
// TODO(erikgrinaker): Remove this in 22.2.
//
// NOTE(review): the text above says the flag must keep being set for 22.1
// compatibility, yet the TODO targets removal in the same 22.2 release -
// confirm the intended removal version.
bool enable_time_bound_iterator_optimization = 3;
// IgnoreGcThreshold can be set by a caller to ignore the target-time when
// checking that the earliest time at which the command operates is above the
// GC threshold. This is safe to set only in very specific situations, such as
// when the target span was OFFLINE since the target time as it is during
// IMPORT INTO. In this case, since the IMPORT knows it is the only writer and
// it only writes new keys, no keys to which it would need to revert have been
// shadowed / could have been GC'ed, so it can safely ignore the GC threshold.
bool ignore_gc_threshold = 4;
}
// A RevertRangeResponse is the return value from the RevertRange() method. It
// carries nothing beyond the common response header.
message RevertRangeResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// ScanFormat is an enumeration of the available response formats for MVCCScan
// operations.
enum ScanFormat {
// Generate the Go constants without the enum type name as a prefix.
option (gogoproto.goproto_enum_prefix) = false;
// The standard MVCCScan format: a slice of KeyValue messages. As the zero
// value, this is the format used when the field is unset.
KEY_VALUES = 0;
// The batch_response format: a byte slice of alternating keys and values,
// each prefixed by their length as a varint.
BATCH_RESPONSE = 1;
}
// A ScanRequest is the argument to the Scan() method. It specifies the
// start and end keys for an ascending scan of [start,end) and the maximum
// number of results (unbounded if zero).
//
// NOTE(review): no maximum-results field is declared in this message (field
// numbers 2 and 3 are reserved); the limit presumably comes from the batch
// header - confirm and update the sentence above.
message ScanRequest {
// Field numbers 2 and 3 are reserved and must not be reused.
reserved 2, 3;
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key are the scan bounds.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The desired format for the response. If set to BATCH_RESPONSE, the server
// will set the batch_responses field in the ScanResponse instead of the rows
// field.
ScanFormat scan_format = 4;
// The desired key-level locking mode used during this scan. When set to None
// (the default), no key-level locking mode is used - meaning that the scan
// does not acquire any locks. When set to any other strength, a lock of that
// strength is acquired with the Unreplicated durability (i.e. best-effort) on
// each of the keys scanned by the request, subject to any key limit applied
// to the batch which limits the number of keys returned.
//
// NOTE: the locks acquired with this strength are point locks on each of the
// keys returned by the request, not a single range lock over the entire span
// scanned by the request.
kv.kvserver.concurrency.lock.Strength key_locking = 5;
}
// A ScanResponse is the return value from the Scan() method.
message ScanResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The scanned key/value pairs. Empty if no rows were scanned.
repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
// The intent rows seen when performing a scan at the READ_UNCOMMITTED
// consistency level. These rows do not count against the MaxSpanRequestKeys
// count.
//
// NOTE: this field is not currently populated with intents for deletion
// tombstones. It probably should be because the rows field may contain
// key-values that are being deleted by corresponding intents. We should
// revisit this decision if this ever becomes a problem.
repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];
// If set, each item in this repeated bytes field contains part of the results
// in batch format - the key/value pairs are a buffer of varint-prefixed
// slices, alternating from key to value. Each entry in this field is
// complete - there are no key/value pairs that are split across more than one
// entry. There are num_keys total pairs across all entries, as defined by the
// ResponseHeader. If set, rows will not be set and vice versa.
repeated bytes batch_responses = 4;
}
// A ReverseScanRequest is the argument to the ReverseScan() method. It specifies the
// start and end keys for a descending scan of [start,end) and the maximum
// number of results (unbounded if zero).
//
// NOTE(review): no maximum-results field is declared in this message (field
// numbers 2 and 3 are reserved); the limit presumably comes from the batch
// header - confirm and update the sentence above.
message ReverseScanRequest {
// Field numbers 2 and 3 are reserved and must not be reused.
reserved 2, 3;
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key are the scan bounds.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The desired format for the response. If set to BATCH_RESPONSE, the server
// will set the batch_responses field in the ReverseScanResponse instead of
// the rows field.
ScanFormat scan_format = 4;
// The desired key-level locking mode used during this scan. When set to None
// (the default), no key-level locking mode is used - meaning that the scan
// does not acquire any locks. When set to any other strength, a lock of that
// strength is acquired with the Unreplicated durability (i.e. best-effort) on
// each of the keys scanned by the request, subject to any key limit applied
// to the batch which limits the number of keys returned.
//
// NOTE: the locks acquired with this strength are point locks on each of the
// keys returned by the request, not a single range lock over the entire span
// scanned by the request.
kv.kvserver.concurrency.lock.Strength key_locking = 5;
}
// A ReverseScanResponse is the return value from the ReverseScan() method.
message ReverseScanResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The scanned key/value pairs. Empty if no rows were scanned.
repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
// The intent rows seen when performing a scan at the READ_UNCOMMITTED
// consistency level. These rows do not count against the MaxSpanRequestKeys
// count.
//
// NOTE: this field is not currently populated with intents for deletion
// tombstones. It probably should be because the rows field may contain
// key-values that are being deleted by corresponding intents. We should
// revisit this decision if this ever becomes a problem.
repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];
// If set, each item in this repeated bytes field contains part of the results
// in batch format - the key/value pairs are a buffer of varint-prefixed
// slices, alternating from key to value. Each entry in this field is
// complete - there are no key/value pairs that are split across more than one
// entry. There are num_keys total pairs across all entries, as defined by the
// ResponseHeader. If set, rows will not be set and vice versa.
repeated bytes batch_responses = 4;
}
// ChecksumMode selects how a consistency check (see CheckConsistencyRequest)
// is carried out.
enum ChecksumMode {
// CHECK_VIA_QUEUE is set for requests made from the consistency queue. In
// this mode, a full check is carried out, and depending on the result a
// recursive consistency check is triggered:
//
// 1. no inconsistency found: if recomputed stats don't match persisted stats,
// trigger a RecomputeStatsRequest.
// 2. inconsistency found: if a diff is available, print it and trigger fatal
// error. If no diff found, trigger recursive check with diff requested
// (which then triggers fatal error).
//
// TODO(tbg): these semantics are an artifact of how consistency checks were
// first implemented. The extra behavior here should move to the consistency
// check queue instead and this option dropped from the enum.
CHECK_VIA_QUEUE = 0;
// CHECK_FULL recomputes the hash of the replicated data in all replicas and
// uses this to determine whether there is an inconsistency.
CHECK_FULL = 1;
// CHECK_STATS hashes only the persisted lease applied state (which notably
// includes the persisted MVCCStats). This catches a large class of
// replica inconsistencies observed in the wild (where replicas apply a
// nonidentical log of commands, and as a result almost always have
// divergent stats), while doing work independent of the size of the data
// contained in the replicas.
CHECK_STATS = 2;
}
// A CheckConsistencyRequest is the argument to the CheckConsistency() method.
// It specifies the start and end keys for a span of ranges to which a
// consistency check should be applied. A consistency check on a range involves
// running a ComputeChecksum on the range followed by a storage.CollectChecksum.
message CheckConsistencyRequest {
// Common request header, embedded by value (non-nullable) in the generated
// Go struct. Its key and end_key delimit the span of ranges to check.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Log a diff of inconsistencies if such inconsistencies are found. This is
// only valid if mode == CHECK_VIA_QUEUE.
bool with_diff = 2;
// How the check is carried out; see ChecksumMode.
ChecksumMode mode = 3;
// Whether to create a RocksDB checkpoint on each replica at the log position
// at which the SHA is computed. The checkpoint is essentially a cheap point-
// in-time backup of the database. It will be put into the engines' auxiliary
// directory and needs to be removed manually to avoid leaking disk space.
bool checkpoint = 4;
// A list of nodes that the consistency check wants to terminate. This is
// typically set when Checkpoint above is also set, as part of a second round
// after a first consistency check that did find a divergence. The second
// round is concerned with damage control and wants the nodes it suspects hold
// anomalous data to be shut down, so that this data isn't served to clients
// (or worse, spread to other replicas).
repeated ReplicaDescriptor terminate = 5 [(gogoproto.nullable) = false];
}
// A CheckConsistencyResponse is the return value from the CheckConsistency() method.
// It returns the status the range was found in.
message CheckConsistencyResponse {
// Common response header, embedded by value (non-nullable) in the generated
// Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Status is the outcome of the consistency check for a single range.
enum Status {
// No inconsistency was detected, but not all replicas returned a checksum.
// This is the zero value.
RANGE_INDETERMINATE = 0;
// A definite inconsistency was detected.
RANGE_INCONSISTENT = 1;
// All replicas of the range agreed on the checksum.
RANGE_CONSISTENT = 2;
// Like RANGE_CONSISTENT, but the recomputed stats disagreed with the
// persisted stats. The persisted stats indicate that they are estimates,
// so this is expected.
RANGE_CONSISTENT_STATS_ESTIMATED = 3;
// Like RANGE_CONSISTENT_STATS_ESTIMATED, but the mismatch occurred with
// persisted stats that claimed to be accurate. This is unexpected and
// likely indicates a bug in our logic to incrementally update the stats
// as commands are evaluated and applied.
RANGE_CONSISTENT_STATS_INCORRECT = 4;
}
// Result is the outcome of the consistency check for one range.
message Result {
// ID of the range that was checked. Exposed as RangeID (of type RangeID)
// in the generated Go code.
int64 range_id = 1 [(gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
// start_key of the range corresponding to range_id (at the time of the
// check). This is useful to send additional requests to only a subset of
// ranges contained within a result later, as requests can only be routed by
// key.
bytes start_key = 2;
// The status the range was found in.
Status status = 3;
// detail contains information related to the operation. If no inconsistency
// is found, it contains informational value such as observed stats. If an
// inconsistency is found, it contains information about that inconsistency
// including the involved replica and, if requested, the diff.
string detail = 4;
}
// result contains a Result for each Range checked, in no particular order.
repeated Result result = 2 [(gogoproto.nullable) = false];
}
// A RecomputeStatsRequest triggers a stats recomputation on the Range
// addressed by the request.
//
// An error will be returned if the start key does not match the start key of the
// target Range.
//
// The stats recomputation touches essentially the whole range, but the command
// avoids having to block other commands by taking care to not interleave
// with splits, and by using the commutativity of stats updates. As a result,
// it is safe to invoke at any time, including repeatedly, though it should be
// used conservatively due to performing a full scan of the Range.
message RecomputeStatsRequest {
// header is the embedded request header; its start key must match the start
// key of the target Range (see the message comment).
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// When dry_run is true, the stats delta is computed, but no stats adjustment
// is performed. This isn't useful outside of testing since RecomputeStats is
// safe and idempotent.
bool dry_run = 2;
}
// A RecomputeStatsResponse is the response to a RecomputeStatsRequest.
message RecomputeStatsResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// added_delta is the adjustment made to the range's stats, i.e.
// `new_stats = old_stats + added_delta`.
storage.enginepb.MVCCStatsDelta added_delta = 2 [(gogoproto.nullable) = false];
}
// An EndTxnRequest is the argument to the EndTxn() method. It specifies
// whether to commit or roll back an extant transaction.
message EndTxnRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// True to commit the transaction; false to abort and roll back.
bool commit = 2;
// If set, deadline represents the maximum (exclusive) timestamp at which the
// transaction can commit (i.e. the maximum timestamp for the txn's reads and
// writes).
// If EndTxn(Commit=true) finds that the txn's timestamp has been pushed above
// this deadline, an error will be returned and the client is supposed to
// roll back the txn.
util.hlc.Timestamp deadline = 3;
// Commit triggers, if any. Note that commit triggers are for
// internal use only and will cause an error if requested through the
// external-facing KV API.
InternalCommitTrigger internal_commit_trigger = 4;
// Set of spans that the transaction has acquired locks within. These are
// spans which must be resolved on txn completion. Note that these spans
// may be condensed to cover aggregate spans if the keys locked by the
// transaction exceeded a size threshold.
//
// The set logically extends to include the keys of all writes in the
// in-flight write set. However, those keys are not stored in this set
// to avoid duplication. This means that elements that are removed from
// that set should be merged into this one.
//
// The slice is maintained in sorted order and all spans are maximally
// merged such that no two spans here overlap each other.
repeated Span lock_spans = 5 [(gogoproto.nullable) = false];
// Set of in-flight intent writes that have been issued by the transaction but
// which may not have succeeded yet. If any promised writes are provided, a
// committing EndTxn request will move a PENDING transaction to the STAGING
// status instead of the COMMITTED status. These in-flight writes must then
// all be confirmed as successful before the transaction can be moved from
// STAGING to COMMITTED. For more, see txnCommitter.
//
// The slice is maintained in sorted order by sequence number. This provides
// O(log n) access to individual writes in this set based on their sequence
// number. See SequencedWriteBySeq.Find and its uses. The set can contain
// multiple SequencedWrites with the same key, but all sequence numbers are
// unique.
repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
// Requires that the transaction completes as a 1 phase commit. This
// guarantees that all writes are to the same range and that no
// intents are left in the event of an error.
//
// Note(andrei): Use this flag with care; retriable errors are not generated
// reliably for these transactions - a TransactionStatusError might be
// returned instead if 1PC execution fails.
bool require_1pc = 6 [(gogoproto.customname) = "Require1PC"];
// True to indicate that lock spans should be resolved with poison=true.
// This is used when the transaction is being aborted independently of the
// main thread of client operation, as in the case of an asynchronous abort
// from the TxnCoordSender on a failed heartbeat. It should only be set to
// true when commit=false.
bool poison = 9;
// Field numbers 7, 8 and 10 are reserved and must not be reused (presumably
// they belonged to since-removed fields).
reserved 7, 8, 10;
}
// An EndTxnResponse is the return value from the EndTxn() method. The final
// transaction record is returned as part of the response header. In particular,
// transaction status and timestamp will be updated to reflect final committed
// values. Clients may propagate the transaction timestamp as the final txn
// commit timestamp in order to preserve causal ordering between subsequent
// transactions.
message EndTxnResponse {
// header is the embedded response header; it carries the final transaction
// record (see the message comment).
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Field numbers 2 and 3 are reserved and must not be reused.
reserved 2;
reserved 3;
// True if the transaction committed on the one phase commit path.
// This means that all writes which were part of the transaction
// were written as a single, atomic write batch to just one range.
bool one_phase_commit = 4;
// The commit timestamp of the STAGING transaction record written
// by the request. Only set if the transaction record was staged.
util.hlc.Timestamp staging_timestamp = 5 [(gogoproto.nullable) = false];
}
// An AdminSplitRequest is the argument to the AdminSplit() method. The
// existing range which contains header.key is split by
// split_key. If split_key is not specified, then this method will
// determine a split key that is roughly halfway through the
// range. The existing range is resized to cover only its start key to
// the split key. The new range created by the split starts at the
// split key and extends to the original range's end key. If split_key
// is known, header.key should also be set to split_key.
//
// New range IDs for each of the split range's replica and a new Raft
// ID are generated by the operation. Split requests are done in the
// context of a distributed transaction which updates range addressing
// records, range metadata and finally, provides a commit trigger to
// update bookkeeping and instantiate the new range on commit.
//
// The new range contains range replicas located on the same stores;
// no range data is moved during this operation. The split can be
// thought of as a mostly logical operation, though some other
// metadata (e.g. abort span and range stats) must be copied or
// recomputed.
//
// expiration_time represents the time that this split expires. Any split that
// is not expired will not be considered for automatic merging by the merge
// queue. Any split requested by the split queue will have an expiration time
// of hlc.Timestamp{} (i.e. the zero timestamp, so they are always eligible for
// automatic merging).
message AdminSplitRequest {
// header is the embedded request header; header.key selects the range to
// split.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// split_key is the key at which to split the range. If unset, a split key
// roughly halfway through the range is determined by the method.
bytes split_key = 2 [(gogoproto.casttype) = "Key"];
// Field number 3 is reserved and must not be reused.
reserved 3;
// expiration_time is the time at which this split expires; until then the
// split is not considered for automatic merging by the merge queue.
util.hlc.Timestamp expiration_time = 4 [(gogoproto.nullable) = false];
// PredicateKeys specifies keys which if not contained within the range should
// cause the split to be rejected. This can be used by a caller to effectively
// send a "conditional split" request, i.e. a split if not already split.
repeated bytes predicate_keys = 5 [(gogoproto.casttype) = "Key"];
}
// An AdminSplitResponse is the return value from the AdminSplit()
// method. It carries no fields beyond the response header.
message AdminSplitResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An AdminUnsplitRequest is the argument to the AdminUnsplit()
// method. The sticky bit of the existing range whose starting key is
// header.key is removed.
//
// Ranges that do not have the sticky bit set are eligible for
// automatic merging.
message AdminUnsplitRequest {
// header is the embedded request header; header.key must be the starting key
// of the range whose sticky bit is to be removed.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An AdminUnsplitResponse is the return value from the
// AdminUnsplit() method. It carries no fields beyond the response header.
message AdminUnsplitResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An AdminMergeRequest is the argument to the AdminMerge() method. A
// merge is performed by calling AdminMerge on the left-hand range of
// two consecutive ranges (i.e. the range which contains keys which
// sort first). This range will be the subsuming range and the right
// hand range will be subsumed. After the merge operation, the
// subsumed range will no longer exist and the subsuming range will
// now encompass all keys from its original start key to the end key
// of the subsumed range. If AdminMerge is called on the final range
// in the key space, it is a noop.
//
// The request must be addressed to the start key of the left hand side.
message AdminMergeRequest {
// header is the embedded request header; it must address the start key of
// the left-hand (subsuming) range.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An AdminMergeResponse is the return value from the AdminMerge()
// method. It carries no fields beyond the response header.
message AdminMergeResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// An AdminTransferLeaseRequest is the argument to the AdminTransferLease()
// method. A lease transfer allows an external entity to control the lease
// holder for a range. The target of the lease transfer needs to be a valid
// replica of the range.
message AdminTransferLeaseRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// target is the StoreID of the lease transfer target. It needs to hold a
// valid replica of the range.
int32 target = 2 [(gogoproto.casttype) = "StoreID"];
}
// An AdminTransferLeaseResponse is the response to an
// AdminTransferLeaseRequest. It carries no fields beyond the response header.
message AdminTransferLeaseResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A ReplicationChange specifies the type and target of a replication change
// operation.
message ReplicationChange {
// change_type is the kind of replication change to perform.
ReplicaChangeType change_type = 1;
// target is the replication target the change applies to.
ReplicationTarget target = 2 [(gogoproto.nullable) = false];
}
// An AdminChangeReplicasRequest is the argument to the AdminChangeReplicas()
// method. A change replicas operation allows adding or removing a set of
// replicas for a range.
message AdminChangeReplicasRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Deprecated change type. Never access directly: use .Changes()
//
// TODO(tbg): remove in 20.1
ReplicaChangeType deprecated_change_type = 2;
// Deprecated list of targets. Never access directly: use .Changes()
//
// TODO(tbg): remove in 20.1
repeated ReplicationTarget deprecated_targets = 3 [(gogoproto.nullable) = false];
// ExpDesc is the expected current range descriptor to modify. If the range
// descriptor is not identical to ExpDesc, the request will fail.
//
// If there is more than one change specified in targets, this expectation
// will be applied to the first change and subsequent changes will use the
// resultant descriptor from successfully applying the previous change.
// If a change with more than one target occurs concurrently with another
// it is possible that an error will occur after partial application of the
// change. Changes are applied in the order they appear in the request.
RangeDescriptor exp_desc = 4 [(gogoproto.nullable) = false];
// The changes to apply to exp_desc. Never access directly: use .Changes().
//
// TODO(tbg): rename to 'changes' in 20.1 and remove Changes().
repeated ReplicationChange internal_changes = 5 [(gogoproto.nullable) = false];
}
// An AdminChangeReplicasResponse is the response to an
// AdminChangeReplicasRequest.
message AdminChangeReplicasResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Desc is the value of the range descriptor upon success.
RangeDescriptor desc = 2 [(gogoproto.nullable) = false];
}
// An AdminRelocateRangeRequest is the argument to the AdminRelocateRange()
// method. Relocates the replicas for a range to the specified target stores.
// The first store in the list of voter targets becomes the new leaseholder;
// per the comment on transfer_lease_to_first_voter, this is a pre-22.1
// contract that callers now opt into explicitly.
message AdminRelocateRangeRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// voter_targets lists the target stores for the range's voter replicas.
repeated ReplicationTarget voter_targets = 2 [(gogoproto.nullable) = false];
// non_voter_targets lists the target stores for the range's non-voter
// replicas.
repeated ReplicationTarget non_voter_targets = 3 [(gogoproto.nullable) = false];
// TODO(a-robinson): Add "reason"/"details" string fields?
//
// As of 22.1 (specifically #74077), leaseholder replicas can remove
// themselves from the range. This means that now, in a joint state, the
// leaseholder that is removing itself chooses the best target replica to
// transfer the lease to, all inside of AdminChangeReplicas.
//
// This means that the pre-22.1 contract of `AdminRelocateRange` to transfer
// the lease to the first voter replica isn't required anymore. Only callers
// that rely on this contract should set this attribute.
bool transfer_lease_to_first_voter = 4;
// NOTE(review): presumably tells the receiver that
// transfer_lease_to_first_voter was populated deliberately by a sender that
// knows about the attribute (as opposed to defaulting to false) — confirm
// against the callers.
//
// TODO(aayush): Migration path:
// 22.1: Send and consult the attribute.
// 22.2: Send but don't consult the attribute.
// 23.1: Stop sending or consulting the attribute. Remove this field.
bool transfer_lease_to_first_voter_accurate = 5;
}
// An AdminRelocateRangeResponse is the response to an
// AdminRelocateRangeRequest. It carries no fields beyond the response header.
message AdminRelocateRangeResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A HeartbeatTxnRequest is the argument to the HeartbeatTxn()
// method. It's sent by transaction coordinators to let the system
// know that the transaction is still ongoing. Note that this
// heartbeat message is different from the heartbeat message in the
// gossip protocol.
message HeartbeatTxnRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// now is presumably the sender's current timestamp at the time of the
// heartbeat — TODO confirm.
//
// NOTE: this could use a ClockTimestamp type, but doing so results in a
// large diff that doesn't seem worth it, given that we never feed this
// timestamp back into a clock.
util.hlc.Timestamp now = 2 [(gogoproto.nullable) = false];
}
// A HeartbeatTxnResponse is the return value from the HeartbeatTxn()
// method. It returns the transaction info in the response header. The
// returned transaction lets the coordinator know the disposition of
// the transaction (i.e. aborted, committed, or pending).
message HeartbeatTxnResponse {
// header is the embedded response header; it carries the returned
// transaction info.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// A GCRequest is the argument to the GC() method. It's sent by range
// lease holders after scanning range data to find expired MVCC values.
message GCRequest {
// header is the embedded request header.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// GCKey pairs a key with a timestamp.
// NOTE(review): presumably the versions of key at or below timestamp are the
// ones to be garbage collected — confirm against the GC implementation.
message GCKey {
bytes key = 1 [(gogoproto.casttype) = "Key"];
util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}
// keys is the list of GCKeys covered by this request.
repeated GCKey keys = 3 [(gogoproto.nullable) = false];
// Threshold is the expiration timestamp.
util.hlc.Timestamp threshold = 4 [(gogoproto.nullable) = false];
// Field number 5 is reserved and must not be reused.
reserved 5;
}
// A GCResponse is the return value from the GC() method. It carries no
// fields beyond the response header.
message GCResponse {
// header is the embedded response header.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// PushTxnType determines what action to take when pushing a transaction.
enum PushTxnType {
// Generate the Go constants without the enum type name as a prefix.
option (gogoproto.goproto_enum_prefix) = false;
// Push the timestamp forward if possible to accommodate a concurrent reader.
PUSH_TIMESTAMP = 0;
// Abort the transaction if possible to accommodate a concurrent writer.
PUSH_ABORT = 1;
// Abort the transaction if it's abandoned, but don't attempt to mutate it
// otherwise.
PUSH_TOUCH = 2;
// Value 3 is reserved and must not be reused.
reserved 3;
}
// A PushTxnRequest is arguments to the PushTxn() method. It's sent by
// readers or writers which have encountered an "intent" laid down by
// another transaction. The goal is to resolve the conflict. Note that
// args.Key should be set to the txn ID of args.PusheeTxn, not
// args.PusherTxn. This RPC is addressed to the range which owns the pushee's
// txn record.
//
// Resolution is trivial if the txn which owns the intent has either
// been committed or aborted already. Otherwise, the existing txn can
// either be aborted (for write/write conflicts), or its commit
// timestamp can be moved forward (for read/write conflicts). The
// course of action is determined by the specified push type, and by
// the owning txn's status and priority.
message PushTxnRequest {
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Transaction which encountered the intent, if applicable. For a
// non-transactional pusher, pusher_txn will only have the priority set (in
// particular, ID won't be set). Used to compare priorities and timestamps if
// priorities are equal.
Transaction pusher_txn = 2 [(gogoproto.nullable) = false];
// Transaction to be pushed, as specified at the intent which led to
// the push transaction request. Note that this may not be the most
// up-to-date value of the transaction record, but will be set or
// merged as appropriate.
storage.enginepb.TxnMeta pushee_txn = 3 [(gogoproto.nullable) = false];
// PushTo is the timestamp which PusheeTxn should be pushed to. During
// conflict resolution, it should be set just after the timestamp of the
// conflicting read or write.
util.hlc.Timestamp push_to = 4 [(gogoproto.nullable) = false];
// Readers set this to PUSH_TIMESTAMP to move pushee_txn's provisional
// commit timestamp forward. Writers set this to PUSH_ABORT to request