-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
api.proto
3708 lines (3388 loc) · 183 KB
/
api.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto3";
package cockroach.roachpb;
option go_package = "github.com/cockroachdb/cockroach/pkg/kv/kvpb";
import "errorspb/errors.proto";
import "kv/kvserver/concurrency/lock/locking.proto";
import "kv/kvserver/readsummary/rspb/summary.proto";
import "roachpb/data.proto";
import "kv/kvpb/errors.proto";
import "roachpb/metadata.proto";
import "roachpb/span_config.proto";
import "rpc/rpcpb/rpc.proto";
import "settings/encoding.proto";
import "sql/catalog/fetchpb/index_fetch.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "util/tracing/tracingpb/recorded_span.proto";
import "util/tracing/tracingpb/tracing.proto";
import "gogoproto/gogo.proto";
import "google/protobuf/duration.proto";
import "multitenant/tenantcapabilities/tenantcapabilitiespb/capabilities.proto";
// ReadConsistencyType specifies what type of consistency is observed
// during read operations.
enum ReadConsistencyType {
  option (gogoproto.goproto_enum_prefix) = false;

  // CONSISTENT reads are guaranteed to read committed data; the
  // mechanism relies on clocks to determine lease expirations.
  CONSISTENT = 0;

  // READ_UNCOMMITTED reads return both committed and uncommitted data.
  // Like INCONSISTENT, using this consistency type can result in dirty
  // reads. Like CONSISTENT, however, it requires the replica performing
  // the read to hold a valid read lease, meaning that it can't return
  // arbitrarily stale data.
  //
  // Note that read-committed does not imply any real-time constraints. If
  // process A completes write w, then process B begins a read r, r is not
  // necessarily guaranteed to observe w even if they are from the same
  // client. This can occur due to the unbounded time delay between Raft
  // appends and state machine application.
  //
  // TODO(baptist): Should we remove this level as it is virtually identical
  // to INCONSISTENT. See #98862 as we may change the behavior to give
  // stronger guarantees.
  READ_UNCOMMITTED = 1;

  // INCONSISTENT reads return the latest available, committed values.
  // They are more efficient, but may read stale values as pending
  // intents are ignored.
  INCONSISTENT = 2;
}
// RoutingPolicy tells the DistSender how to route a request to the replicas
// of its target range(s). A policy can dictate which replicas are considered
// to be targets and in which order they are tried.
enum RoutingPolicy {
  // LEASEHOLDER means that the DistSender should route the request to the
  // leaseholder replica(s) of its target range(s).
  LEASEHOLDER = 0;

  // NEAREST means that the DistSender should route the request to the
  // nearest replica(s) of its target range(s).
  NEAREST = 1;
}
// ResumeReason specifies why a ResumeSpan was generated instead of a
// complete result.
enum ResumeReason {
  option (gogoproto.goproto_enum_prefix) = false;

  // Zero value; no resume, or an unknown reason from a future or past
  // cockroachdb version.
  RESUME_UNKNOWN = 0;

  // A key limit was exceeded, i.e. MaxSpanRequestKeys.
  RESUME_KEY_LIMIT = 1;

  // A byte limit was exceeded, i.e. TargetBytes.
  // NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
  RESUME_BYTE_LIMIT = 2;

  // An intent limit was exceeded. This is currently never returned to
  // clients, since MVCCScan converts the result into a LockConflictError.
  // NB: 21.2 and below will return RESUME_KEY_LIMIT instead.
  RESUME_INTENT_LIMIT = 3;

  // The DistSender encountered a range boundary and returned a partial
  // result, in response to return_on_range_boundary.
  RESUME_RANGE_BOUNDARY = 4;

  // The ElasticCPUHandle signalled that the command evaluation exceeded its
  // allotted CPU time. It is the caller's responsibility to resume from the
  // returned resume key.
  RESUME_ELASTIC_CPU_LIMIT = 5;
}
// RequestHeaderPure is not to be used directly. It's generated only for use of
// its marshaling methods by RequestHeader. See the comment there.
//
// It declares the same fields as RequestHeader except kvnemesis_seq. Keep the
// field names, numbers, and cast types in sync with RequestHeader.
message RequestHeaderPure {
// Mirrors RequestHeader.key.
bytes key = 3 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// Mirrors RequestHeader.end_key.
bytes end_key = 4 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// Mirrors RequestHeader.sequence.
int32 sequence = 5 [
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
}
// RequestHeaderCrdbTest is not to be used directly. It's generated only for
// use of its marshaling methods by RequestHeader. See the comment there.
//
// It declares the same fields as RequestHeader, including kvnemesis_seq. Keep
// the field names, numbers, and options in sync with RequestHeader.
message RequestHeaderCrdbTest {
// Placeholder message type for kvnemesis_seq; the field is cast to a Go type
// via the casttype option below.
message Empty{};
// Mirrors RequestHeader.kvnemesis_seq.
Empty kvnemesis_seq = 6 [
(gogoproto.customname) = "KVNemesisSeq",
(gogoproto.nullable) = false,
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/kv/kvnemesis/kvnemesisutil.Container"];
// Mirrors RequestHeader.key.
bytes key = 3 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// Mirrors RequestHeader.end_key.
bytes end_key = 4 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// Mirrors RequestHeader.sequence.
int32 sequence = 5 [
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
}
// RequestHeader is supplied with every storage node request.
//
// This message has some trickery attached to it to allow it to carry a
// testing-only field (KVNemesisSeq) in a way that avoids overhead in production
// code. The field is nominally an empty Message, but is cast to a Go type that
// is an int64 when under the crdb_test tag, and an empty struct otherwise. This
// ensures that production code does not have to pay for the increased struct
// size that would result from having the field always present. Additionally, we
// want to avoid encoding extra bytes on the wire; (gogoproto.nullable) will
// still encode the field tag even when the referenced field is empty. To work
// around this, we don't auto-generate the Size() and various marshal methods
// for RequestHeader. Instead, we have two sibling messages, RequestHeaderPure
// and RequestHeaderCrdbTest. The former is RequestHeader without the
// KVNemesisSeq field, and the latter is identical to RequestHeader but does
// have the generated Size and Marshal methods. We then implement the
// corresponding methods on RequestHeader by delegating to either, making sure
// to only delegate to RequestHeaderCrdbTest when we're under the crdb_test
// build tag *and* the KVNemesisSeq is nonzero. This effectively simulates a
// version of (gogoproto.nullable) in which the zero value is not represented on
// the wire. No change to unmarshalling code is necessary, as the generated
// unmarshaler on RequestHeader can deal with the presence of the field just
// fine.
//
// NB: unfortunately there isn't a way to have the {Pure,CrdbTest} messages
// unexported.
//
// NB: ensure that RequestHeader{,Pure,CrdbTest} stay in sync when adding
// or removing fields.
message RequestHeader {
// Marshal and Size are deliberately not generated for this message; they are
// implemented by delegating to the sibling messages, as described above.
option (gogoproto.marshaler) = false;
option (gogoproto.sizer) = false;
// Field numbers 1 and 2 are reserved and must not be reused.
reserved 1, 2;
// Placeholder message type for kvnemesis_seq; the field is cast to a Go type
// via the casttype option below.
message Empty{};
// Empty is zero-size in production. It's an int64 under the crdb_test build tag.
// This is used to enable kvnemesis testing, which builds on uniqueness of values
// in the MVCC history. Deletions don't have a user-definable value, so we need
// an extra identifier, which is provided by this field.
//
// NB: it's important that this isn't at the end of the message, see:
// https://dave.cheney.net/2015/10/09/padding-is-hard
Empty kvnemesis_seq = 6 [
(gogoproto.customname) = "KVNemesisSeq",
(gogoproto.nullable) = false,
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/kv/kvnemesis/kvnemesisutil.Container"];
// The key for the request. If the request operates on a range, this
// represents the starting key for the range.
bytes key = 3 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// The end key is empty if the request spans only a single key. Otherwise,
// it must order strictly after Key. In such a case, the header indicates
// that the operation takes place on the key range from Key to EndKey,
// including Key and excluding EndKey.
bytes end_key = 4 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
// A zero-indexed transactional sequence number.
int32 sequence = 5 [
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/storage/enginepb.TxnSeq"];
}
// ResponseHeader is returned with every storage node response. Note that
// this is different from a BatchResponse's header.
message ResponseHeader {
  // txn is non-nil if the request specified a non-nil transaction.
  // The transaction timestamp and/or priority may have been updated,
  // depending on the outcome of the request.
  //
  // Once txn is merged into the BatchResponse_Header.Txn, it will be
  // reset to nil to avoid sending superfluous information over the
  // network.
  Transaction txn = 3;

  // The next span to resume from when the response doesn't cover the full
  // span requested. This can happen when a bound on the result size is set
  // through max_span_request_keys or target_bytes in the batch header, or
  // when a scan has been stopped before covering the requested data because
  // of scan_options.
  //
  // ResumeSpan is unset when the entire span of keys has been operated on.
  // The span is set to the original span if the request was ignored because
  // max_span_request_keys was hit due to another request in the batch. For a
  // reverse scan the end_key is updated.
  Span resume_span = 4;

  // When resume_span is populated, this specifies the reason why the
  // operation wasn't completed and needs to be resumed.
  ResumeReason resume_reason = 7;

  // When resume_reason is RESUME_BYTE_LIMIT, this may contain the size of
  // the next result entry which caused the limit to be exceeded, i.e. the
  // size of the first entry when reading from the resume span. It is only
  // supported by Get and Scan. In a batch, this will only be set on the
  // first response that exceeds the limit.
  //
  // NB: This is best-effort, and may be 0 in some rare cases. Specifically,
  // if TargetBytes is exactly satisfied by a result that exhausted a range
  // scan, or by a response from a multi-request batch, we won't do
  // additional work (e.g. send another RPC to the next range) only to obtain
  // resume_next_bytes.
  //
  // Also note that this is unaffected by whole_rows_of_size. The client may
  // care about whole rows, but we'll only return the size of the next KV
  // pair (which may just be part of the row), to avoid the cost of
  // additional IO.
  int64 resume_next_bytes = 9;

  // The number of keys operated on.
  int64 num_keys = 5;

  // The number of bytes returned. Only populated for requests that support
  // it (at the time of writing, Scan, ReverseScan and ExportRequest). The
  // number returned here corresponds to the (Header).TargetBytes field and
  // loosely measures the bytes in the timestamps, keys, and values of the
  // returned rows.
  int64 num_bytes = 8;

  // Field number 6 is reserved and must not be reused.
  reserved 6;
}
// A GetRequest is the argument for the Get() method.
message GetRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The desired key-level locking mode used during this get. When set to
  // None (the default), no key-level locking mode is used - meaning that the
  // get does not acquire a lock. When set to any other strength, a lock of
  // that strength is acquired with the associated durability guarantees on
  // the key, if it exists.
  kv.kvserver.concurrency.lock.Strength key_locking_strength = 2;

  // KeyLockingDurability denotes the durability with which locks, if any are
  // acquired, should be acquired. It should only be set in conjunction with
  // a non-None KeyLockingStrength.
  //
  // Unreplicated locks are kept in-memory on the leaseholder of the locked
  // key. As such, their existence until a transaction commits is
  // best-effort. They are susceptible to things like lease transfers and
  // node crashes. However, they are faster to acquire and resolve when
  // compared to replicated locks. This makes them an appealing choice when
  // locks are not required for correctness. This includes things like
  // (non-exhaustive list):
  // 1. Transactions that run under serializable isolation level.
  // 2. Implicit SFU for weaker isolation levels, where we know we will
  //    subsequently perform a (replicated) intent write on the key being
  //    locked.
  //
  // Replicated locks on the other hand, once acquired, are guaranteed to
  // exist until the transaction finalizes (commits or aborts). They are not
  // susceptible to things like lease transfers, range {splits,merges},
  // memory limits, node crashes etc. Replication adds a performance penalty
  // for lock acquisition and resolution; as such, they should only be used
  // by transactions that need guaranteed locks for correctness (read:
  // read-committed or snapshot isolation transactions).
  kv.kvserver.concurrency.lock.Durability key_locking_durability = 3;
}
// A GetResponse is the return value from the Get() method.
// If the key doesn't exist, Value will be nil.
message GetResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The value read for the key; nil if the key doesn't exist.
  Value value = 2;

  // The intent seen, if any, when using the READ_UNCOMMITTED consistency
  // level.
  //
  // NOTE: this field is not currently populated with intents for deletion
  // tombstones. It probably should be because the value field may contain a
  // value that is being deleted by a corresponding intent. We should revisit
  // this decision if this ever becomes a problem.
  Value intent_value = 3;
}
// A ProbeRequest is an internal request type used to send a replicated
// no-op through a Range as a means of probing write availability.
//
// The request will be serialized like a regular write, i.e. it will acquire
// latches and declare key access, but it will not check locks (i.e. if an
// intent exists on the key that is being probed, the probe will not observe
// it). ProbeRequest can be served by any Replica including followers, i.e.
// it can be used to verify that a given Replica is able to access the
// replication layer.
message ProbeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// A ProbeResponse is the response to a ProbeRequest.
message ProbeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// IsSpanEmptyRequest is used to determine whether a span contains any keys,
// garbage or otherwise. It is used to determine whether data deleted by a
// DeleteRange tombstone has been fully removed.
//
// Generally, the caller should set the MaxSpanRequestKeys header on the
// BatchRequest to 1 so that the DistSender will process the overlapping
// ranges sequentially and stop after the first non-empty range.
message IsSpanEmptyRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// IsSpanEmptyResponse is the response to an IsSpanEmptyRequest.
//
// The result is reported through the NumKeys field of the ResponseHeader: a
// positive value means there is data in the queried span, and zero means the
// span is empty.
message IsSpanEmptyResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// A PutRequest is the argument to the Put() method.
message PutRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The value to put.
  Value value = 2 [(gogoproto.nullable) = false];

  // Specify as true to put the value without a corresponding
  // timestamp. This option should be used with care as it precludes
  // the use of this value with transactions.
  bool inline = 3;

  // NOTE: For internal use only! Set to indicate that the put is
  // writing to virgin keyspace and no reads are necessary to
  // rationalize MVCC.
  bool blind = 4;
}
// A PutResponse is the return value from the Put() method.
message PutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// A ConditionalPutRequest is the argument to the ConditionalPut() method.
//
// - Returns true and sets value if exp_bytes equals existing value.
// - If key doesn't exist and exp_bytes is empty, sets value.
// - Otherwise, returns a ConditionFailedError containing the actual value of
//   the key.
//
// Note that the client is free to send more requests after a
// ConditionFailedError. This is not generally allowed after other errors
// because of fears over the ambiguity of the side-effects of failed requests
// (in particular, the timestamps at which intents might have been written).
// ConditionFailedError is a special case as we ensure there's no ambiguity;
// the error carries a WriteTimestamp that's the upper bound of the timestamps
// intents were written at.
message ConditionalPutRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The value to put.
  Value value = 2 [(gogoproto.nullable) = false];

  // Field number 3 is reserved and must not be reused.
  reserved 3;

  // exp_bytes represents the expected existing value for the key. If empty,
  // the key is expected to not exist. If not empty, these bytes are expected
  // to contain the tag and data of the existing value (without the existing
  // value's checksum; the byte array is expected to come from
  // Value.TagAndDataBytes()). A value's checksum covers the key in addition
  // to covering the value, so not including a checksum here makes for an
  // easier to use API - the creator of the ConditionalPutRequest can simply
  // put in bytes coming from a different key.
  //
  // Note that there's no such thing as expecting a key to exist, but have an
  // empty value. Such key-values don't exist.
  //
  // Note that the existing value's timestamp doesn't matter, only its data.
  // So, the CPut will succeed in ABA situations (if a reader got value A and
  // checks against it later, the check will succeed even if, in the
  // meantime, there's been a subsequent write of value B and another one
  // back to value A).
  bytes exp_bytes = 6;

  // NOTE: For internal use only! Set to indicate that the put is
  // writing to virgin keyspace and no reads are necessary to
  // rationalize MVCC.
  bool blind = 4;

  // Typically if a specific, non-empty expected value is supplied, it *must*
  // exist with that value. Passing this indicates that it is also OK if the
  // key does not exist. This is useful when a given value is expected but it
  // is possible it has not yet been written.
  bool allow_if_does_not_exist = 5;

  // Specify as true to put the value without a corresponding
  // timestamp. This option should be used with care as it precludes
  // the use of this value with transactions.
  bool inline = 7;
}
// A ConditionalPutResponse is the return value from the ConditionalPut()
// method.
message ConditionalPutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// An InitPutRequest is the argument to the InitPut() method.
//
// - If key doesn't exist, sets value.
// - If key exists, returns a ConditionFailedError if value != existing value.
//   If failOnTombstones is set to true, tombstone values count as mismatched
//   values and will cause a ConditionFailedError.
message InitPutRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The value to put.
  Value value = 2 [(gogoproto.nullable) = false];

  // NOTE: For internal use only! Set to indicate that the put is
  // writing to virgin keyspace and no reads are necessary to
  // rationalize MVCC.
  bool blind = 3;

  // If true, tombstones cause ConditionFailedErrors.
  //
  // NOTE(review): this name is camelCase, unlike the snake_case field names
  // used elsewhere in this file. Renaming it would change the field's
  // text-format name, so confirm with callers before touching it.
  bool failOnTombstones = 4;
}
// An InitPutResponse is the return value from the InitPut() method.
message InitPutResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// An IncrementRequest is the argument to the Increment() method. It
// increments the value for key, and returns the new value.
//
// If no value exists for a key, incrementing by 0 is not a noop, but will
// create a zero value. IncrementRequest cannot be called on a key set by
// Put() or ConditionalPut(). Similarly, Put() and ConditionalPut() cannot be
// invoked on an incremented key.
message IncrementRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The amount by which to increment the value for the key.
  int64 increment = 2;
}
// An IncrementResponse is the return value from the Increment method. The
// new value after increment is specified in NewValue. If the value could not
// be decoded as specified, Error will be set.
message IncrementResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // The value after the increment was applied.
  int64 new_value = 2;
}
// A DeleteRequest is the argument to the Delete() method.
message DeleteRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// A DeleteResponse is the return value from the Delete() method.
message DeleteResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // True if there was a key that got deleted. A tombstone is written
  // unconditionally, regardless of whether the key is found.
  bool found_key = 2;
}
// A DeleteRangeRequest is the argument to the DeleteRange() method. It
// specifies the range of keys to delete.
//
// A DeleteRangeRequest populates the timestamp cache and is tracked for
// refreshes.
message DeleteRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Field number 2 is reserved and must not be reused.
  reserved 2;

  // Return the keys that are deleted in the response.
  bool return_keys = 3;

  // Delete "inline" keys which are stored without MVCC timestamps. Note that
  // an "inline" DeleteRange will fail if it attempts to delete any keys
  // which contain timestamped (non-inline) values; this option should only
  // be used on keys which are known to store inline values, such as data in
  // cockroach's time series system.
  //
  // Similarly, attempts to delete keys with inline values will fail unless
  // this flag is set to true; the setting must match the data being deleted.
  //
  // Inline values cannot be deleted transactionally; a DeleteRange with
  // "inline" set to true will fail if it is executed within a transaction.
  bool inline = 4;

  // If enabled, the range is deleted using an MVCC range tombstone, which is
  // a cheaper constant-time write operation (but still requires a scan to
  // check for conflicts and adjust MVCC stats). This option cannot be used
  // in a transaction, and it cannot be combined with Inline or ReturnKeys.
  //
  // The caller must check storage.CanUseMVCCRangeTombstones before using
  // this parameter: it is new in 22.2, and controlled by the default-off
  // cluster setting storage.mvcc.range_tombstones.enabled.
  bool use_range_tombstone = 5;

  // If enabled together with UseRangeTombstone, the MVCC range tombstone
  // will only be written if there exist point keys/tombstones in the span
  // that aren't already covered by an MVCC range tombstone. As a
  // convenience, it considers empty spans equivalent to being covered by an
  // MVCC range tombstone, so it will omit the write across an entirely empty
  // span too.
  bool idempotent_tombstone = 7;

  // If enabled, and a span is deleted using a range tombstone, then the
  // GCHint on the corresponding Range will be updated. The hint instructs
  // the MVCC GC queue to delete this data as soon as it can, and helps
  // optimize GC for bulk deletions.
  bool update_range_delete_gc_hint = 8 [
    (gogoproto.customname) = "UpdateRangeDeleteGCHint"
  ];

  // Predicates for a predicate based DeleteRange; see DeleteRangePredicates.
  DeleteRangePredicates predicates = 6 [(gogoproto.nullable) = false];
}
// DeleteRangePredicates, if specified, will conduct a predicate based
// DeleteRange. A predicate based delete range will issue tombstones on live
// keys that match the filters provided by the caller. In particular, long
// runs of matched keys will get deleted with a range tombstone, while
// smaller runs will get deleted with point tombstones. Note that the
// keyspace across runs does not overlap.
//
// To pass DeleteRangePredicates, the client must also pass
// UseRangeTombstone.
message DeleteRangePredicates {
  // ImportEpoch specifies that all keys with a non-zero
  // MVCCValueHeader.ImportEpoch == ImportEpoch should be deleted.
  uint32 import_epoch = 1;

  // StartTime specifies an exclusive lower bound to surface keys for
  // deletion. If specified, DeleteRange will only issue tombstones to keys
  // within the span [startKey, endKey) that also have MVCC versions with
  // timestamps between (startTime, endTime), where endTime is the request
  // timestamp.
  //
  // The main application for this is a rollback of IMPORT INTO on a
  // non-empty table. Here, DeleteRange with startTime = ImportStartTime,
  // must only delete keys written by the import. In other words, older,
  // pre-import, data cannot be touched. Because IMPORT INTO takes a table
  // offline and does not allow masking an existing key, this operation will
  // not issue tombstones to pre-import data that were written at or below
  // StartTime.
  //
  // In other words, this operation assumes that for a k@t in the importing
  // table:
  // - t must be < endTime
  // - if t in (startTime, endTime), then there is no other k@t' where
  //   t' <= startTime.
  util.hlc.Timestamp start_time = 6 [(gogoproto.nullable) = false];
}
// A DeleteRangeResponse is the return value from the DeleteRange() method.
message DeleteRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // All the deleted keys if return_keys is set.
  repeated bytes keys = 2 [
    (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"
  ];
}
// A ClearRangeRequest is the argument to the ClearRange() method. It
// specifies a range of keys to clear from the underlying engine. Note that
// this differs from the behavior of DeleteRange, which sets transactional
// intents and writes tombstones to the deleted keys. ClearRange is used when
// permanently dropping or truncating table data.
//
// ClearRange also updates the GC threshold for the range to the timestamp at
// which this command executes, to prevent reads at earlier timestamps from
// incorrectly returning empty results.
//
// NOTE: it is important that this method only be invoked on a key range
// which is guaranteed to be both inactive and not see future writes (until
// Deadline, if set, below). Ignoring this warning may result in data loss.
message ClearRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // Deadline can be set to a time at or after which the server will refuse
  // to execute this ClearRange request, providing a form of replay
  // protection: if a caller sets this to a time in the near future, they can
  // then wait for that time (plus max offset) to have passed, at which point
  // they can reuse the span they cleared without fear of this request being
  // replayed later and clearing subsequent writes.
  util.hlc.Timestamp deadline = 2 [(gogoproto.nullable) = false];
}
// A ClearRangeResponse is the return value from the ClearRange() method.
message ClearRangeResponse {
  ResponseHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];
}
// A RevertRangeRequest specifies a range of keys in which to clear all MVCC
// revisions more recent than some TargetTime from the underlying engine,
// thus reverting the range (from the perspective of an MVCC scan) to that
// time.
message RevertRangeRequest {
  RequestHeader header = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.embed) = true
  ];

  // TargetTime specifies the time to which to "revert" the range by clearing
  // any MVCC key with a strictly higher timestamp. TargetTime must be higher
  // than the GC Threshold for the replica - so that it is assured that the
  // keys for that time are still there - or the request will fail.
  util.hlc.Timestamp target_time = 2 [(gogoproto.nullable) = false];

  // IgnoreGcThreshold can be set by a caller to ignore the target-time when
  // checking that the earliest time at which the command operates is above
  // the GC threshold. This is safe to set only in very specific situations,
  // such as when the target span was OFFLINE since the target time as it is
  // during IMPORT INTO. In this case, since the IMPORT knows it is the only
  // writer and it only writes new keys, no keys to which it would need to
  // revert have been shadowed / could have been GC'ed, so it can safely
  // ignore the GC threshold.
  bool ignore_gc_threshold = 4;

  // Field number 3 is reserved and must not be reused.
  reserved 3;
}
// A RevertRangeResponse is the return value from the RevertRange() method.
message RevertRangeResponse {
// Response header; embedded by value in the generated Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// ScanFormat is an enumeration of the available response formats for MVCCScan
// operations.
enum ScanFormat {
// Generated Go constants are not prefixed with the enum type name.
option (gogoproto.goproto_enum_prefix) = false;
// The standard MVCCScan format: a slice of KeyValue messages.
KEY_VALUES = 0;
// The batch_response format: a byte slice of alternating keys and values,
// each prefixed by their length as a varint.
BATCH_RESPONSE = 1;
// The coldata.Batch response format: only necessary (according to the
// fetchpb.IndexFetchSpec) columns are populated in the coldata.Batch'es,
// which are either serialized (in the Apache Arrow format) into the
// batch_responses field or passed as is in the col_batches field of the
// response.
COL_BATCH_RESPONSE = 2;
}
// ColBatches is a way to pass []coldata.Batch without serialization through the
// protobufs for Scans and ReverseScans, when they are executed locally and with
// the COL_BATCH_RESPONSE scan format.
message ColBatches {
// Code generation of the stringer, equality, marshaler, sizer and unmarshaler
// is turned off for this message.
option (gogoproto.goproto_stringer) = false;
option (gogoproto.equal) = false;
option (gogoproto.marshaler) = false;
option (gogoproto.sizer) = false;
option (gogoproto.unmarshaler) = false;
// The batches being passed. Declared as bytes in the schema, but the
// generated Go field uses the coldata.Batch custom type.
repeated bytes col_batches = 1 [(gogoproto.nullable) = false,
(gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/col/coldata.Batch"];
}
// A ScanRequest is the argument to the Scan() method. It specifies the
// start and end keys for an ascending scan of [start,end) and the maximum
// number of results (unbounded if zero).
//
// NOTE(review): no max-results field is declared in this message; presumably
// the limit is carried outside of it (e.g. in a header) - confirm.
message ScanRequest {
// Field numbers 2 and 3 are reserved and must not be reused.
reserved 2, 3;
// Request header; embedded by value in the generated Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The desired format for the response. If set to BATCH_RESPONSE or
// COL_BATCH_RESPONSE, the server will set either the batch_responses field or
// the col_batches field in the ScanResponse instead of the rows field.
ScanFormat scan_format = 4;
// The desired key-level locking mode used during this scan. When set to None
// (the default), no key-level locking mode is used - meaning that the scan
// does not acquire any locks. When set to any other strength, a lock of that
// strength is acquired with the associated durability guarantees on each of
// the keys scanned by the request, subject to any key limit applied to the
// batch which limits the number of keys returned.
//
// NOTE: the locks acquired with this strength are point locks on each of the
// keys returned by the request, not a single range lock over the entire span
// scanned by the request.
kv.kvserver.concurrency.lock.Strength key_locking_strength = 5;
// KeyLockingDurability denotes the durability with which locks, if any are
// acquired, should be acquired with. It should only be set in conjunction
// with a non-None KeyLockingStrength.
//
// Unreplicated locks are kept in-memory on the leaseholder of the locked key.
// As such, their existence until a transaction commits is best-effort. They
// are susceptible to things like lease transfers and node crashes. However,
// they are faster to acquire and resolve when compared to replicated locks.
// This makes them an appealing choice when locks are not required for
// correctness. This includes things like (non-exhaustive list):
// 1. Transactions that run under serializable isolation level.
// 2. Implicit SFU for weaker isolation levels, where we know we will
// subsequently perform a (replicated) intent write on the key being locked.
//
// Replicated locks on the other hand, once acquired, are guaranteed to exist
// until the transaction finalizes (commits or aborts). They are not
// susceptible to things like lease transfers, range {splits,merges}, memory
// limits, node crashes etc. Replication adds a performance penalty for lock
// acquisition and resolution; as such, they should only be used by
// transactions that need guaranteed locks for correctness (read:
// read-committed or snapshot isolation transactions).
kv.kvserver.concurrency.lock.Durability key_locking_durability = 6;
}
// A ScanResponse is the return value from the Scan() method.
//
// At most one of rows, batch_responses and col_batches carries the scan
// results, as described on each field below.
message ScanResponse {
// Response header; embedded by value in the generated Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Empty if no rows were scanned or BATCH_RESPONSE / COL_BATCH_RESPONSE scan
// format was used.
repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
// The intent rows seen when performing a scan at the READ_UNCOMMITTED
// consistency level. These rows do not count against the MaxSpanRequestKeys
// count.
//
// NOTE: this field is not currently populated with intents for deletion
// tombstones. It probably should be because the rows field may contain
// key-values that are being deleted by corresponding intents. We should
// revisit this decision if this ever becomes a problem.
repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];
// If set, then depending on the ScanFormat, each item in this repeated bytes
// field contains part of the results in batch format:
// - for BATCH_RESPONSE - the key/value pairs are a buffer of varint-prefixed
// slices, alternating from key to value. Each entry in this field is complete
// (i.e. there are no key/value pairs that are split across more than one
// entry). There are num_keys total pairs across all entries, as defined by
// the ResponseHeader.
// - for COL_BATCH_RESPONSE - each []byte is a single serialized (in the
// Apache Arrow format) coldata.Batch. Each SQL row in that coldata.Batch is
// complete. num_keys total key-value pairs were used to populate all of the
// coldata.Batch'es in this field.
//
// If set, rows and col_batches will not be set.
repeated bytes batch_responses = 4;
// If set, then each element in the slice is a single non-serialized
// coldata.Batch. Each SQL row in that coldata.Batch is complete. num_keys
// total key-value pairs were used to populate all of the coldata.Batch'es in
// this field.
//
// If set, rows and batch_responses will not be set.
ColBatches col_batches = 5 [(gogoproto.nullable) = false];
}
// A ReverseScanRequest is the argument to the ReverseScan() method. It specifies the
// start and end keys for a descending scan of [start,end) and the maximum
// number of results (unbounded if zero).
//
// NOTE(review): no max-results field is declared in this message; presumably
// the limit is carried outside of it (e.g. in a header) - confirm.
message ReverseScanRequest {
// Field numbers 2 and 3 are reserved and must not be reused.
reserved 2, 3;
// Request header; embedded by value in the generated Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The desired format for the response. If set to BATCH_RESPONSE or
// COL_BATCH_RESPONSE, the server will set either the batch_responses field or
// the col_batches field in the ReverseScanResponse instead of the rows field.
ScanFormat scan_format = 4;
// The desired key-level locking mode used during this scan. When set to None
// (the default), no key-level locking mode is used - meaning that the scan
// does not acquire any locks. When set to any other strength, a lock of that
// strength is acquired with the associated durability guarantees on each of
// the keys scanned by the request, subject to any key limit applied to the
// batch which limits the number of keys returned.
//
// NOTE: the locks acquired with this strength are point locks on each of the
// keys returned by the request, not a single range lock over the entire span
// scanned by the request.
kv.kvserver.concurrency.lock.Strength key_locking_strength = 5;
// KeyLockingDurability denotes the durability with which locks, if any are
// acquired, should be acquired with. It should only be set in conjunction
// with a non-None KeyLockingStrength.
//
// Unreplicated locks are kept in-memory on the leaseholder of the locked key.
// As such, their existence until a transaction commits is best-effort. They
// are susceptible to things like lease transfers and node crashes. However,
// they are faster to acquire and resolve when compared to replicated locks.
// This makes them an appealing choice when locks are not required for
// correctness. This includes things like (non-exhaustive list):
// 1. Transactions that run under serializable isolation level.
// 2. Implicit SFU for weaker isolation levels, where we know we will
// subsequently perform a (replicated) intent write on the key being locked.
//
// Replicated locks on the other hand, once acquired, are guaranteed to exist
// until the transaction finalizes (commits or aborts). They are not
// susceptible to things like lease transfers, range {splits,merges}, memory
// limits, node crashes etc. Replication adds a performance penalty for lock
// acquisition and resolution; as such, they should only be used by
// transactions that need guaranteed locks for correctness (read:
// read-committed or snapshot isolation transactions).
kv.kvserver.concurrency.lock.Durability key_locking_durability = 6;
}
// A ReverseScanResponse is the return value from the ReverseScan() method.
//
// At most one of rows, batch_responses and col_batches carries the scan
// results, as described on each field below.
message ReverseScanResponse {
// Response header; embedded by value in the generated Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Empty if no rows were scanned or BATCH_RESPONSE / COL_BATCH_RESPONSE scan
// format was used.
repeated KeyValue rows = 2 [(gogoproto.nullable) = false];
// The intent rows seen when performing a scan at the READ_UNCOMMITTED
// consistency level. These rows do not count against the MaxSpanRequestKeys
// count.
//
// NOTE: this field is not currently populated with intents for deletion
// tombstones. It probably should be because the rows field may contain
// key-values that are being deleted by corresponding intents. We should
// revisit this decision if this ever becomes a problem.
repeated KeyValue intent_rows = 3 [(gogoproto.nullable) = false];
// If set, then depending on the ScanFormat, each item in this repeated bytes
// field contains part of the results in batch format:
// - for BATCH_RESPONSE - the key/value pairs are a buffer of varint-prefixed
// slices, alternating from key to value. Each entry in this field is complete
// (i.e. there are no key/value pairs that are split across more than one
// entry). There are num_keys total pairs across all entries, as defined by
// the ResponseHeader.
// - for COL_BATCH_RESPONSE - each []byte is a single serialized (in the
// Apache Arrow format) coldata.Batch. Each SQL row in that coldata.Batch is
// complete. num_keys total key-value pairs were used to populate all of the
// coldata.Batch'es in this field.
//
// If set, rows and col_batches will not be set.
repeated bytes batch_responses = 4;
// If set, then each element in the slice is a single non-serialized
// coldata.Batch. Each SQL row in that coldata.Batch is complete. num_keys
// total key-value pairs were used to populate all of the coldata.Batch'es in
// this field.
//
// If set, rows and batch_responses will not be set.
ColBatches col_batches = 5 [(gogoproto.nullable) = false];
}
// ChecksumMode selects the kind of consistency check that a
// CheckConsistencyRequest carries out.
enum ChecksumMode {
// CHECK_VIA_QUEUE is set for requests made from the consistency queue. In
// this mode, a full check is carried out, and depending on the result a
// recursive consistency check is triggered:
//
// 1. no inconsistency found: if recomputed stats don't match persisted stats,
// trigger a RecomputeStatsRequest.
// 2. inconsistency found: if a diff is available, print it and trigger fatal
// error. If no diff found, trigger recursive check with diff requested
// (which then triggers fatal error).
//
// TODO(tbg): these semantics are an artifact of how consistency checks were
// first implemented. The extra behavior here should move to the consistency
// check queue instead and this option dropped from the enum.
CHECK_VIA_QUEUE = 0;
// CHECK_FULL recomputes the hash of the replicated data in all replicas and
// uses this to determine whether there is an inconsistency.
CHECK_FULL = 1;
// CHECK_STATS hashes only the persisted lease applied state (which notably
// includes the persisted MVCCStats). This catches a large class of
// replica inconsistencies observed in the wild (where replicas apply a
// nonidentical log of commands, and as a result almost always have
// divergent stats), while doing work independent of the size of the data
// contained in the replicas.
CHECK_STATS = 2;
}
// A CheckConsistencyRequest is the argument to the CheckConsistency() method.
// It specifies the start and end keys for a span of ranges to which a
// consistency check should be applied. A consistency check on a range involves
// running a ComputeChecksum on the range followed by a storage.CollectChecksum.
message CheckConsistencyRequest {
// Request header; embedded by value in the generated Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// The kind of consistency check to run; see ChecksumMode.
ChecksumMode mode = 3;
// Field numbers 2, 4 and 5 are reserved and must not be reused.
reserved 2, 4, 5;
}
// A CheckConsistencyResponse is the return value from the CheckConsistency() method.
// It returns the status the range was found in.
message CheckConsistencyResponse {
// Response header; embedded by value in the generated Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// Status is the outcome of the consistency check for a single range.
enum Status {
// No inconsistency was detected, but not all replicas returned a checksum.
RANGE_INDETERMINATE = 0;
// A definite inconsistency was detected.
RANGE_INCONSISTENT = 1;
// All replicas of the range agreed on the checksum.
RANGE_CONSISTENT = 2;
// Like RANGE_CONSISTENT, but the recomputed stats disagreed with the
// persisted stats. The persisted stats indicate estimates, so this is
// expected.
RANGE_CONSISTENT_STATS_ESTIMATED = 3;
// Like RANGE_CONSISTENT_STATS_ESTIMATED, but the mismatch occurred with
// persisted stats that claimed to be accurate. This is unexpected and
// likely indicates a bug in our logic to incrementally update the stats
// as commands are evaluated and applied.
RANGE_CONSISTENT_STATS_INCORRECT = 4;
}
// Result is the outcome of the consistency check for one range.
message Result {
// ID of the range that was checked.
int64 range_id = 1 [(gogoproto.customname) = "RangeID",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"];
// start_key of the range corresponding to range_id (at the time of the
// check). This is useful to send additional requests to only a subset of
// ranges contained within a result later, as requests can only be routed by
// key.
bytes start_key = 2;
// The status the range was found in.
Status status = 3;
// detail contains information related to the operation. If no inconsistency
// is found, it contains informational value such as observed stats. If an
// inconsistency is found, it contains information about that inconsistency
// including the involved replica and, if requested, the diff.
string detail = 4;
}
// result contains a Result for each Range checked, in no particular order.
repeated Result result = 2 [(gogoproto.nullable) = false];
}
// A RecomputeStatsRequest triggers a stats recomputation on the Range addressed by
// the request.
//
// An error will be returned if the start key does not match the start key of the
// target Range.
//
// The stats recomputation touches essentially the whole range, but the command
// avoids having to block other commands by taking care to not interleave
// with splits, and by using the commutativity of stats updates. As a result,
// it is safe to invoke at any time, including repeatedly, though it should be
// used conservatively due to performing a full scan of the Range.
message RecomputeStatsRequest {
// Request header; embedded by value in the generated Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// When dry_run is true, the stats delta is computed, but no stats adjustment
// is performed. This isn't useful outside of testing since RecomputeStats is
// safe and idempotent.
bool dry_run = 2;
}
// A RecomputeStatsResponse is the response to a RecomputeStatsRequest.
message RecomputeStatsResponse {
// Response header; embedded by value in the generated Go struct.
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// added_delta is the adjustment made to the range's stats, i.e.
// `new_stats = old_stats + added_delta`.
storage.enginepb.MVCCStatsDelta added_delta = 2 [(gogoproto.nullable) = false];
}
// An EndTxnRequest is the argument to the EndTxn() method. It specifies
// whether to commit or roll back an extant transaction.
message EndTxnRequest {
// Request header; embedded by value in the generated Go struct.
RequestHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// True to commit the transaction; false to abort and roll back.
bool commit = 2;
// If set, deadline represents the maximum (exclusive) timestamp at which the
// transaction can commit (i.e. the maximum timestamp for the txn's reads and
// writes).
// If EndTxn(Commit=true) finds that the txn's timestamp has been pushed to or
// above this deadline, an error will be returned and the client is supposed
// to rollback the txn.
util.hlc.Timestamp deadline = 3 [(gogoproto.nullable) = false];
// Optional commit triggers. Note that commit triggers are for
// internal use only and will cause an error if requested through the
// external-facing KV API.
InternalCommitTrigger internal_commit_trigger = 4;
// Set of spans that the transaction has acquired locks within. These are
// spans which must be resolved on txn completion. Note that these spans
// may be condensed to cover aggregate spans if the keys locked by the
// transaction exceeded a size threshold.
//
// The set logically extends to include the keys of all writes in the
// in-flight write set. However, those keys are not stored in this set
// to avoid duplication. This means that elements that are removed from
// that set should be merged into this one.
//
// The slice is maintained in sorted order and all spans are maximally
// merged such that no two spans here overlap each other.
repeated Span lock_spans = 5 [(gogoproto.nullable) = false];
// Set of in-flight intent writes that have been issued by the transaction but
// which may not have succeeded yet. If any promised writes are provided, a
// committing EndTxn request will move a PENDING transaction to the STAGING
// status instead of the COMMITTED status. These in-flight writes must then
// all be confirmed as successful before the transaction can be moved from
// STAGING to COMMITTED. For more, see txnCommitter.
//
// The slice is maintained in sorted order by sequence number. This provides
// O(log n) access to individual writes in this set based on their sequence
// number. See SequencedWriteBySeq.Find and its uses. The set can contain
// multiple SequencedWrites with the same key, but all sequence numbers are
// unique.
repeated SequencedWrite in_flight_writes = 17 [(gogoproto.nullable) = false];
// Requires that the transaction completes as a 1 phase commit. This
// guarantees that all writes are to the same range and that no
// intents are left in the event of an error.
//
// Note(andrei): Use this flag with care; retriable errors are not generated
// reliably for these transactions - a TransactionStatusError might be
// returned instead if 1PC execution fails.
bool require_1pc = 6 [(gogoproto.customname) = "Require1PC"];
// Disables the transaction from attempting 1 phase commit. Cannot be used in
// conjunction with the Require1PC flag.
bool disable_1pc = 11 [(gogoproto.customname) = "Disable1PC"];
// True to indicate that lock spans should be resolved with poison=true.
// This is used when the transaction is being aborted independently of the
// main thread of client operation, as in the case of an asynchronous abort
// from the TxnCoordSender on a failed heartbeat. It should only be set to
// true when commit=false.
bool poison = 9;
// Field numbers 7, 8 and 10 are reserved and must not be reused.
reserved 7, 8, 10;
}
// An EndTxnResponse is the return value from the EndTxn() method. The final
// transaction record is returned as part of the response header. In particular,
// transaction status and timestamp will be updated to reflect final committed
// values. Clients may propagate the transaction timestamp as the final txn
// commit timestamp in order to preserve causal ordering between subsequent
// transactions.
message EndTxnResponse {
ResponseHeader header = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
reserved 2;
reserved 3;
// True if the transaction committed on the one phase commit path.
// This means that all writes which were part of the transaction
// were written as a single, atomic write batch to just one range.
bool one_phase_commit = 4;
// The commit timestamp of the STAGING transaction record written
// by the request. Only set if the transaction record was staged.
util.hlc.Timestamp staging_timestamp = 5 [(gogoproto.nullable) = false];
// ReplicatedLocksReleasedOnCommit, if non-empty, indicate that replicated
// locks with strength Shared or Exclusive were released in the referenced key
// spans when committing this transaction. Notably, this field is left unset
// if only write intents were resolved. The field is also left unset for
// transactions that aborted.
//
// The caller must bump the timestamp cache across these spans to the
// transaction's commit timestamp. Doing so ensures that the released locks
// (acquired by the now committed transaction) continue to provide protection
// against other writers up to the commit timestamp, even after the locks have
// been released.