proposer_kv.proto
// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto3";
package cockroach.kv.kvserver.storagepb;
option go_package = "kvserverpb";
import "roachpb/api.proto";
import "roachpb/data.proto";
import "roachpb/metadata.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/mvcc3.proto";
import "kv/kvserver/kvserverpb/state.proto";
import "kv/kvserver/readsummary/rspb/summary.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";
// Split is emitted when a Replica commits a split trigger. It signals that the
// Replica has prepared the on-disk state for both the left and right hand
// sides of the split, and that the left hand side Replica should be updated as
// well as the right hand side created.
message Split {
option (gogoproto.equal) = true;
roachpb.SplitTrigger trigger = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
// RHSDelta holds the statistics for what was written to what is now the
// right-hand side of the split during the batch which executed it.
// The on-disk state of the right-hand side is already correct, but the
// Store must learn about this delta to update its counters appropriately.
storage.enginepb.MVCCStats rhs_delta = 2 [(gogoproto.nullable) = false,
(gogoproto.customname) = "RHSDelta"];
}
// Merge is emitted by a Replica which commits a transaction with
// a MergeTrigger (i.e. absorbs its right neighbor).
message Merge {
option (gogoproto.equal) = true;
roachpb.MergeTrigger trigger = 1 [(gogoproto.nullable) = false,
(gogoproto.embed) = true];
}
// ChangeReplicas is emitted by a Replica which commits a transaction with
// a ChangeReplicasTrigger.
message ChangeReplicas {
option (gogoproto.goproto_stringer) = false;
roachpb.ChangeReplicasTrigger trigger = 1 [(gogoproto.nullable) = false,
(gogoproto.embed) = true];
}
// ComputeChecksum is emitted when a ComputeChecksum request is evaluated. It
// instructs the replica to compute a checksum at the time the command is
// applied.
message ComputeChecksum {
option (gogoproto.equal) = true;
// ChecksumID is a handle by which the checksum can be retrieved in a later
// CollectChecksum request.
bytes checksum_id = 1 [
(gogoproto.nullable) = false,
(gogoproto.customname) = "ChecksumID",
(gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"
];
// The version used to pick the checksum method. Only when the version matches
// that hardcoded in the binary will a computation be carried out.
uint32 version = 5;
reserved 2;
roachpb.ChecksumMode mode = 3;
// If set, a checkpoint (i.e. cheap backup) of the engine will be taken. This
// is expected to be set only if we already know that there is an
// inconsistency and we want to preserve as much state as possible.
bool checkpoint = 4;
// Replicas processing this command which find themselves in this slice will
// terminate. See `ComputeChecksumRequest.Terminate`.
repeated roachpb.ReplicaDescriptor terminate = 6 [(gogoproto.nullable) = false];
}
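// Illustrative sketch (not part of this file): one plausible shape of the
// apply-time handling described above, written as Go pseudocode inside this
// comment. All names below (replica, replicaChecksumVersion, takeCheckpoint,
// computeChecksum, storeChecksumForCollection, replicaSetContains) are
// hypothetical rather than CockroachDB's actual API.
//
//    func (r *replica) handleComputeChecksum(cc ComputeChecksum) {
//        if cc.Version != replicaChecksumVersion {
//            // The binary does not know this checksum method; skip.
//            return
//        }
//        if cc.Checkpoint {
//            // Cheap engine backup, taken when an inconsistency is already
//            // suspected and we want to preserve as much state as possible.
//            r.takeCheckpoint()
//        }
//        if replicaSetContains(cc.Terminate, r.descriptor()) {
//            r.fatalf("replica terminated by the consistency checker")
//        }
//        sum := r.computeChecksum(cc.Mode)
//        // Retrievable later via a CollectChecksum request keyed on ChecksumID.
//        r.storeChecksumForCollection(cc.ChecksumID, sum)
//    }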
// Compaction holds core details about a suggested compaction.
message Compaction {
option (gogoproto.equal) = true;
// bytes indicates the expected space reclamation from compaction.
int64 bytes = 1;
// suggested_at is nanoseconds since the epoch.
int64 suggested_at_nanos = 2;
}
// SuggestedCompaction holds start and end keys in conjunction with
// the compaction details.
message SuggestedCompaction {
option (gogoproto.equal) = true;
bytes start_key = 1 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
bytes end_key = 2 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.Key"];
Compaction compaction = 3 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
}
// ReplicatedEvalResult is the structured information which together with a
// RocksDB WriteBatch constitutes the proposal payload.
// For the majority of proposals, we expect ReplicatedEvalResult to be
// trivial; only changes to the metadata state of the Replica (splits, merges,
// rebalances, leases, log truncation, ...) or certain special commands need to
// attach information here on the basis of which all Replicas must take action.
message ReplicatedEvalResult {
// Updates to the Replica's ReplicaState. By convention and as outlined in
// the comment on the ReplicaState message, this field is sparsely populated
// and any field set overwrites the corresponding field in the state, perhaps
// with additional side effects (for instance on a descriptor update).
kv.kvserver.storagepb.ReplicaState state = 2;
Split split = 3;
Merge merge = 4;
ComputeChecksum compute_checksum = 21;
bool is_lease_request = 6;
bool is_probe = 23;
// The timestamp at which this command is writing. Used to verify the validity
// of the command against the GC threshold and to update the followers'
// clocks. Only set if the request that produced this command is a write that
// cares about the timestamp cache.
util.hlc.Timestamp write_timestamp = 8 [(gogoproto.nullable) = false];
// The stats delta corresponding to the data in this WriteBatch. On
// a split, contains only the contributions to the left-hand side.
storage.enginepb.MVCCStats deprecated_delta = 10; // See #18828
storage.enginepb.MVCCStatsDelta delta = 18 [(gogoproto.nullable) = false];
ChangeReplicas change_replicas = 12;
// RaftLogDelta is the delta in bytes caused by truncation of the raft log.
// It is only populated when evaluating a TruncateLogRequest. The inclusive
// index for the truncation is specified in State.TruncatedState. This delta
// is computed under the assumption that the truncation is happening over
// the interval [RaftExpectedFirstIndex, index]. If the actual truncation at
// a replica is over some interval [x, index] where x !=
// RaftExpectedFirstIndex, it is that replica's job to recalculate this delta
// in order to be accurate, or to make note of the fact that its raft log
// size stats may now be inaccurate.
//
// NB: this delta does not include the byte size of sideloaded entries.
// Sideloaded entries are not expected to be common enough that it is worth
// the optimization to calculate the delta once (at the leaseholder).
int64 raft_log_delta = 13;
// RaftExpectedFirstIndex is populated starting at cluster version
// LooselyCoupledRaftLogTruncation. When this is not populated, the replica
// should not delay enacting the truncation.
uint64 raft_expected_first_index = 25;
// MVCCHistoryMutation describes mutations of MVCC history that may violate
// the closed timestamp, timestamp cache, and guarantees that rely on these
// (e.g. linearizability and serializability). It is currently used to
// disconnect rangefeeds that overlap these spans, as a safeguard -- the
// caller is expected to ensure there are no rangefeeds over such spans in the
// first place.
//
// This is a separate message type to keep the base struct comparable in Go.
message MVCCHistoryMutation {
repeated roachpb.Span spans = 1 [(gogoproto.nullable) = false];
}
MVCCHistoryMutation mvcc_history_mutation = 24 [(gogoproto.customname) = "MVCCHistoryMutation"];
// AddSSTable is a side effect that must execute before the Raft application
// is committed. It must be idempotent to account for an ill-timed crash after
// applying the side effect, but before committing the batch.
//
// TODO(tschottdorf): additionally, after the crash, the node must not serve
// traffic until the persisted committed log has fully applied. Otherwise, we
// risk exposing data created through such a side effect whose corresponding
// Raft command hasn't committed yet. This isn't so much an issue with AddSSTable
// since these Ranges are not user-visible, but it is a general concern assuming
// other such side effects are added.
message AddSSTable {
bytes data = 1;
uint32 crc32 = 2 [(gogoproto.customname) = "CRC32"];
roachpb.Span span = 3 [(gogoproto.nullable) = false];
// If true, all SST MVCC timestamps equal the WriteTimestamp. This is given
// by the SSTTimestampToRequestTimestamp request parameter, which was
// introduced in 22.1 and only used with the MVCCAddSSTable cluster version.
//
// TODO(erikgrinaker): This field currently controls whether to emit an
// AddSSTable event across the rangefeed. We could equivalently check
// MVCCHistoryMutation == nil, but that field was introduced in 22.1,
// and so for log entries written by 21.2 or older it will always be nil.
// It may be possible to remove this field and check MVCCHistoryMutation
// instead in 22.2, but _ONLY_ if a below-Raft cluster migration has also
// taken place since 22.1, to make sure all Raft log entries from 21.2 or
// older have been applied on all replicas.
bool at_write_timestamp = 4;
}
AddSSTable add_sstable = 17 [(gogoproto.customname) = "AddSSTable"];
// This is the proposal timestamp for the active lease while evaluating a lease request.
// It will be used to make sure we know if a lease was extended after we sent out the request
// but before we tried to apply it.
util.hlc.Timestamp prev_lease_proposal = 20 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/util/hlc.ClockTimestamp"];
// PriorReadSummary is a summary of the reads that have been served on the
// range prior to this proposal, which must be a lease change (request or
// transfer) if the field is set. The read summary is used to update the new
// leaseholder's timestamp cache to prevent them from serving writes that
// violate previously served reads.
//
// The summary, when available, can be used in place of bumping the new
// leaseholder's timestamp cache to the new lease's start time. It has two
// distinct advantages:
// 1. it can transfer a higher-resolution snapshot of the reads on the range
// through a lease transfer, to make the lease transfers less disruptive to
// writes because the timestamp cache won't be bumped as high.
// 2. it can transfer information about reads with synthetic timestamps, which
// are not otherwise captured by the new lease's start time.
//
// When a ReadSummary is set in a ReplicatedEvalResult, there is always also a
// write to the RangePriorReadSummaryKey in the RaftCommand.WriteBatch. The
// persisted summary may be identical to the summary in this field, but it
// does not have to be. Notably, we intended for the summary included in the
// ReplicatedEvalResult to eventually be a much higher-resolution version of
// the ReadSummary than the version persisted. This scheme of persisting a
// compressed ReadSummary indefinitely and including a higher-resolution
// ReadSummary on the RaftCommand allows us to optimize for the common case
// where the lease transfer is applied on the new leaseholder through Raft log
// application while ensuring correctness in the case where the lease transfer
// is applied on the new leaseholder through a Raft snapshot.
kv.kvserver.readsummary.ReadSummary prior_read_summary = 22;
reserved 1, 5, 7, 9, 14, 15, 16, 19, 10001 to 10013;
}
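// Illustrative sketch (not part of this file): per the comments above, applying
// a command amounts to applying the WriteBatch and then acting on whichever of
// the sparse side-effect fields are set. Go pseudocode; the names (Engine,
// mergeReplicaState, handleSplit, handleMerge, handleChangeReplicas) are
// hypothetical and the real apply path handles many more cases.
//
//    func applyReplicatedEvalResult(res ReplicatedEvalResult, wb *WriteBatch, eng Engine, state *ReplicaState) error {
//        // 1. Make the data change itself.
//        if wb != nil {
//            if err := eng.ApplyBatch(wb.Data); err != nil {
//                return err
//            }
//        }
//        // 2. Sparse state update: any field set in res.State overwrites the
//        // corresponding field of the replica's in-memory ReplicaState.
//        if res.State != nil {
//            mergeReplicaState(state, res.State)
//        }
//        // 3. Structured side effects, of which at most a few are set.
//        if res.Split != nil {
//            handleSplit(*res.Split)
//        }
//        if res.Merge != nil {
//            handleMerge(*res.Merge)
//        }
//        if res.ChangeReplicas != nil {
//            handleChangeReplicas(*res.ChangeReplicas)
//        }
//        return nil
//    }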
// WriteBatch is the serialized representation of a RocksDB write
// batch. A wrapper message is used so that the absence of the field
// can be distinguished from a zero-length batch, and so structs
// containing pointers to it can be compared with the == operator.
message WriteBatch {
bytes data = 1;
}
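// Illustrative sketch (not part of this file): what the wrapper buys in the
// generated Go code. The enclosing struct and its field names are hypothetical;
// the same reasoning applies to LogicalOpLog below.
//
//    type pendingProposal struct {
//        writeBatch *WriteBatch // nil means "no write batch", not "empty batch"
//    }
//
//    func hasWriteBatch(p pendingProposal) bool {
//        // p.writeBatch == nil             -> field absent
//        // p.writeBatch != nil, Data empty -> present but zero-length
//        return p.writeBatch != nil
//    }
//
// Because the batch is held behind a pointer rather than as a raw []byte,
// pendingProposal remains comparable with ==; a struct containing a byte slice
// directly could not be compared that way.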
// LogicalOpLog is a log of logical MVCC operations. A wrapper message
// is used so that the absence of the field can be distinguished from a
// zero-length batch, and so structs containing pointers to it can be
// compared with the == operator.
message LogicalOpLog {
repeated storage.enginepb.MVCCLogicalOp ops = 1 [(gogoproto.nullable) = false];
}
// RaftCommand is the message written to the raft log. It contains some metadata
// about the proposal itself and a ReplicatedEvalResult + WriteBatch.
message RaftCommand {
// Metadata about the proposal itself. These fields exist at
// top-level instead of being grouped in a sub-message for
// backwards-compatibility.
// proposer_lease_seq is provided to verify at raft command apply-time
// that the lease under which the command was proposed remains in effect.
//
// To see why lease verification downstream of Raft is required, consider the
// following example:
// - replica 1 receives a client request for a write
// - replica 1 checks the lease; the write is permitted
// - replica 1 proposes the command
// - time passes, replica 2 commits a new lease
// - the command applies on replica 1
// - replica 2 serves anomalous reads which don't see the write
// - the command applies on replica 2
int64 proposer_lease_sequence = 6 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.LeaseSequence"];
// deprecated_proposer_lease served the same purpose as proposer_lease_seq.
// As of VersionLeaseSequence, it is no longer in use.
//
// However, unless we add a check that all existing Raft logs on all nodes
// in the cluster contain only "new" leases, we won't be able to remove the
// legacy code.
roachpb.Lease deprecated_proposer_lease = 5;
// When the command is applied, its result is an error if the lease log
// counter has already reached (or exceeded) max_lease_index.
//
// The lease index is a reorder protection mechanism - we don't want Raft
// commands (proposed by a single node, the one with proposer_lease) executing
// in a different order than the one in which the corresponding KV requests
// were evaluated and the commands were proposed. This is important because
// latching does not fully serialize commands - mostly when it comes to
// updates to the internal state of the range (this should be re-evaluated
// once proposer-evaluated KV is completed - see #10413).
// Similar to the Raft applied index, it is strictly increasing, but may have
// gaps. A command will only apply successfully if its max_lease_index has not
// been surpassed by the Range's applied lease index (in which case the
// command may need to be retried, that is, regenerated with a higher
// max_lease_index). When the command applies, the new lease index will
// increase to max_lease_index (so a potential later replay will fail).
//
// This mechanism was introduced as a simpler alternative to using the Raft
// applied index, which is fraught with complexity due to the need to predict
// exactly the log position at which a command will apply, even when the Raft
// leader is not colocated with the lease holder (which usually proposes all
// commands).
//
// Pinning the lease-index to the assigned slot (as opposed to allowing gaps
// as we do now) is an interesting avenue to explore from the standpoint of
// parallelization: One could hope to enforce command ordering in that way
// (without recourse to a higher-level locking primitive such as the command
// queue). This is a hard problem: First of all, managing the pending
// commands gets more involved; a command must not be removed if others have
// been added after it, and on removal, the assignment counters must be
// updated accordingly. Managing retry of proposals becomes trickier as
// well, as that uproots whatever ordering was originally envisioned.
//
// This field is set through MaxLeaseFooter hackery. Unlike with the
// ClosedTimestamp, which needs to be nullable in this proto (see comment),
// there are no nullability concerns with this field. This is because
// max_lease_index is a primitive type, so it does not get encoded when zero.
// This alone ensures that the field is not encoded twice in the combined
// RaftCommand+MaxLeaseFooter proto.
uint64 max_lease_index = 4;
// The closed timestamp carried by this command. Once a follower is told to
// apply this command, it knows that there will be no further writes at
// timestamps <= closed_timestamp. Note that the command itself might
// represent a write at a lower timestamp, so the closed timestamp can only be
// used after this command is applied.
//
// The field can be zero, which is to be interpreted as no closed timestamp
// update. If the value is not zero, it is greater than or equal to the closed
// timestamps carried by all previous commands.
//
// This field is set through ClosedTimestampFooter hackery. The field is
// nullable so that it does not get encoded when empty. This prevents the
// field from being encoded twice in the combined
// RaftCommand+ClosedTimestampFooter proto (encoding it twice is not illegal
// as far as proto goes - the last value wins when decoding - but it is a
// problem for sideloading, which reduces the size of the proto).
util.hlc.Timestamp closed_timestamp = 17;
reserved 3;
// replicated_eval_result is a set of structured information that instructs
// replicated state changes to the part of a Range's replicated state machine
// that exists outside of RocksDB.
ReplicatedEvalResult replicated_eval_result = 13 [(gogoproto.nullable) = false];
// write_batch is a RocksDB WriteBatch that will be applied to RocksDB during
// the application of the Raft command. The batch can be thought of as a
// series of replicated instructions that inform a RocksDB engine on how to
// change.
WriteBatch write_batch = 14;
// logical_op_log contains a series of logical MVCC operations that correspond
// to the physical operations being made in the write_batch.
LogicalOpLog logical_op_log = 15;
// trace_data, if not empty, contains details of the proposer's trace as
// returned by Tracer.InjectMetaInto(sp.Meta(), ...). This is used to create
// spans for the command application process on all the replicas that "follow
// from" the proposer.
map<string, string> trace_data = 16;
// Fields used below-raft for replication admission control. See
// kvflowcontrolpb.RaftAdmissionMeta for how this data is selectively decoded.
// AdmissionPriority of the command (maps to admission.WorkPriority); used
// within a tenant below-raft for replication admission control.
int32 admission_priority = 18;
// AdmissionCreateTime is equivalent to Time.UnixNano() from the creation time
// of the request (or a parent request) of which this command is a part.
// It's used within a tenant below-raft for replication admission control; see
// admission.WorkInfo.CreateTime for details.
int64 admission_create_time = 19;
// AdmissionOriginNode captures where this raft command originated. It's used
// to inform said node of this raft command's (virtual) admission in order for
// it to release flow tokens for subsequent commands.
int32 admission_origin_node = 20 [(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
reserved 1, 2, 10001 to 10014;
}
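// Illustrative sketch (not part of this file): the apply-time verification
// described above for proposer_lease_sequence and max_lease_index, as Go
// pseudocode. The replicaApplyState type and its field names are hypothetical.
//
//    type replicaApplyState struct {
//        activeLeaseSequence int64  // sequence of the currently active lease
//        leaseAppliedIndex   uint64 // highest max_lease_index applied so far
//    }
//
//    func checkCanApply(s *replicaApplyState, proposerLeaseSeq int64, maxLeaseIndex uint64) error {
//        // The lease the command was proposed under must still be in effect;
//        // otherwise we get the anomalous reads from the example above.
//        if proposerLeaseSeq != s.activeLeaseSequence {
//            return errors.New("lease changed since proposal; reject command")
//        }
//        // Reorder protection: a command overtaken by a later lease index (or
//        // a replay of an already-applied command) must not apply.
//        if maxLeaseIndex <= s.leaseAppliedIndex {
//            return errors.New("max_lease_index too low; proposal must be regenerated")
//        }
//        // On success the applied lease index jumps to max_lease_index, which
//        // is what makes a later replay of this command fail the check above.
//        s.leaseAppliedIndex = maxLeaseIndex
//        return nil
//    }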
// RaftCommandFooter contains a subset of the fields in RaftCommand. It is used
// to optimize a pattern where most of the fields in RaftCommand are marshaled
// outside of a heavily contended critical section, except for the fields in the
// footer, which are assigned and marshaled inside of the critical section and
// appended to the marshaled byte buffer. This minimizes the memory allocation
// and marshaling work performed under lock.
message RaftCommandFooter {
uint64 max_lease_index = 4;
// NOTE: unlike in RaftCommand, there's no reason to make this field nullable
// and so we make it non-nullable in order to save allocations. This means
// that the field on a decoded RaftCommand will also never be nil, but we
// don't rely on that.
util.hlc.Timestamp closed_timestamp = 17 [(gogoproto.nullable) = false];
}
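// Illustrative sketch (not part of this file): the footer trick works because
// protobuf decodes the concatenation of two encodings of the same message as a
// merge, with later scalar values overwriting earlier ones, and because
// RaftCommandFooter reuses RaftCommand's field numbers 4 and 17. A hypothetical
// encoding helper is shown below; CockroachDB's real code uses its own
// marshaling utilities and wraps these bytes in a command envelope.
//
//    import "github.com/gogo/protobuf/proto"
//
//    func encodeRaftCommand(cmd *RaftCommand, assignFooter func() RaftCommandFooter) ([]byte, error) {
//        // Outside the critical section: marshal everything except the footer
//        // fields, which are left unset and therefore not encoded here.
//        buf, err := proto.Marshal(cmd)
//        if err != nil {
//            return nil, err
//        }
//        // Under lock: assign and marshal only the small footer, then append.
//        // Decoding buf as a RaftCommand later picks up the footer's
//        // max_lease_index and closed_timestamp via the merge rule.
//        footer := assignFooter()
//        footerBytes, err := proto.Marshal(&footer)
//        if err != nil {
//            return nil, err
//        }
//        return append(buf, footerBytes...), nil
//    }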