-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
mvcc3.proto
262 lines (243 loc) · 11.3 KB
/
mvcc3.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto3";
package cockroach.storage.engine.enginepb;
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";
// TxnMeta is the metadata of a Transaction record.
message TxnMeta {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // id is the UUID that uniquely identifies the transaction. It is always
  // populated.
  bytes id = 1 [
    (gogoproto.customname) = "ID",
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.nullable) = false
  ];

  reserved 2;

  // key is the transaction's anchor key. It is typically the first key that
  // the transaction reads or writes, and it determines which range in the
  // cluster holds the transaction record.
  //
  // TODO(tschottdorf): [(gogoproto.casttype) = "Key"];
  bytes key = 3;

  // epoch is incremented each time the txn is retried.
  int32 epoch = 4 [(gogoproto.casttype) = "TxnEpoch"];

  // timestamp is the proposed timestamp for the transaction. Its initial value
  // is the current wall time on the txn coordinator; the timestamp cache
  // forwards it if the txn attempts to write "beneath" another txn's writes.
  //
  // The txn performs its writes at the most up-to-date value of this
  // timestamp that it knows about. Suppose, for example, that a txn starts at
  // some timestamp, writes a key/value, and has its timestamp forwarded in
  // the process because a later version already exists at that key. Once the
  // txn coordinator learns of the updated timestamp, it begins writing at the
  // updated timestamp. Until it learns of the forwarded timestamp, however,
  // the coordinator may keep issuing writes at the original one. When the txn
  // commits, intent resolution bumps any intents written at an older
  // timestamp to the final commit timestamp.
  //
  // Reads do not occur at this timestamp. They occur at OrigTimestamp, which
  // is tracked in the containing roachpb.Transaction.
  //
  // Historically, writes were performed at the txn's original timestamp,
  // which was necessary to avoid lost update anomalies in snapshot isolation
  // mode. Snapshot isolation mode is no longer supported, and there are now
  // several important reasons to write at this timestamp rather than at the
  // txn's original timestamp:
  //
  // 1. The timestamp cache forwards this timestamp when the transaction
  //    attempts to write beneath a more recent read. Leaving the intent at
  //    the original timestamp would write beneath that read, violating an
  //    invariant that time-bound iterators rely on.
  //
  //    For example, consider a client that polls for changes to a key with a
  //    time-bound iterator. The client reads (ts5, ts10], sees no writes, and
  //    reports that no changes have occurred up to ts10. A txn then writes an
  //    intent at its original timestamp ts7, and the timestamp cache forwards
  //    the txn's timestamp to ts11 because of the client's read. Meanwhile,
  //    the client reads (ts10, ts15] and, again seeing no intents, reports
  //    that no changes have occurred to the key up to ts15. The txn now
  //    commits at ts11 and bumps the intent to ts11. But the client believes
  //    it has seen all changes up to ts15, and so never sees the intent!
  //    Writing intents at the provisional commit timestamp instead avoids
  //    this problem: in this example, the intent would be written at ts11 and
  //    picked up by the client's next read from (ts10, ts15].
  //
  // 2. Unnecessary PushTxn roundtrips are avoided. If a transaction is
  //    forwarded from ts5 to ts10, the rest of its intents are written at
  //    ts10. Reads at ts < ts10 that encounter these intents can ignore them.
  //    Had the intents been left at ts5 instead, these reads would have
  //    needed to send PushTxn requests just to find out that the txn had, in
  //    fact, been forwarded to a non-conflicting time.
  //
  // 3. Unnecessary intent rewriting is avoided. Writing at the original
  //    timestamp after this timestamp has been forwarded guarantees that the
  //    value must be rewritten at the forwarded timestamp if the transaction
  //    commits.
  util.hlc.Timestamp timestamp = 5 [(gogoproto.nullable) = false];

  // min_timestamp is the timestamp that the transaction's gateway assigned to
  // it when it began its first epoch. It is the earliest timestamp at which
  // the transaction could have written any of its intents.
  //
  // It is currently used in two places:
  // 1. By the transaction itself and by concurrent transactions, to decide
  //    whether this transaction's record can be initially written. The
  //    timestamp is compared against the transaction's corresponding write
  //    timestamp cache entry so that a finalized transaction can never
  //    commit, either after a replay or after a transaction abort. See
  //    CanCreateTxnRecord.
  // 2. By intent resolution, to scan for intents efficiently with a
  //    time-bound iterator - i.e. there can be intents to resolve up to the
  //    timestamp that the txn started with.
  util.hlc.Timestamp min_timestamp = 9 [(gogoproto.nullable) = false];

  // priority is the transaction's priority. It is ratcheted on transaction
  // pushes.
  int32 priority = 6 [(gogoproto.casttype) = "TxnPriority"];

  // sequence is a zero-indexed sequence number that is increased on each
  // request sent as part of the transaction. When it is set in the header of
  // a batch of requests, it corresponds to the sequence number of the last
  // request. It provides idempotency and protects against out-of-order
  // application (by means of a transaction retry).
  int32 sequence = 7 [(gogoproto.casttype) = "TxnSeq"];

  reserved 8;
}
// MVCCStatsDelta is convertible to MVCCStats, but uses signed variable width
// encodings for most fields that make it more efficient to store negative
// values. This makes the encodings incompatible.
message MVCCStatsDelta {
  option (gogoproto.equal) = true;

  // TODO(nvanbenschoten): MVCCPersistentStats has now been split out from
  // this MVCCStatsDelta type, so contains_estimates can be turned into a
  // three-valued type ('UNCHANGED', 'NO', and 'YES').
  bool contains_estimates = 14;

  // Fixed-width signed 64-bit fields (sfixed64).

  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];

  // ZigZag-encoded signed varint fields (sint64). This encoding keeps
  // negative values compact on the wire.

  sint64 live_bytes = 4;
  sint64 live_count = 5;
  sint64 key_bytes = 6;
  sint64 key_count = 7;
  sint64 val_bytes = 8;
  sint64 val_count = 9;
  sint64 intent_bytes = 10;
  sint64 intent_count = 11;
  sint64 sys_bytes = 12;
  sint64 sys_count = 13;
}
// MVCCPersistentStats is convertible to MVCCStats, but uses signed variable
// width encodings for most fields that make it efficient to store positive
// values but inefficient to store negative values. This makes the encodings
// incompatible.
message MVCCPersistentStats {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  bool contains_estimates = 14;

  // Fixed-width signed 64-bit fields (sfixed64).

  sfixed64 last_update_nanos = 1;
  sfixed64 intent_age = 2;
  sfixed64 gc_bytes_age = 3 [(gogoproto.customname) = "GCBytesAge"];

  // Plain varint fields (int64). This encoding is compact for non-negative
  // values, but a negative value always occupies ten bytes.

  int64 live_bytes = 4;
  int64 live_count = 5;
  int64 key_bytes = 6;
  int64 key_count = 7;
  int64 val_bytes = 8;
  int64 val_count = 9;
  int64 intent_bytes = 10;
  int64 intent_count = 11;
  int64 sys_bytes = 12;
  int64 sys_count = 13;
}
// RangeAppliedState combines the raft and lease applied indices with
// mvcc stats. These are all persisted on each transition of the Raft
// state machine (i.e. on each Raft application), so they are stored
// in the same RocksDB key for efficiency.
message RangeAppliedState {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // raft_applied_index is the highest (and most recent) index that has been
  // applied to the Raft state machine.
  uint64 raft_applied_index = 1;

  // lease_applied_index is the highest (and most recent) lease index that has
  // been applied to the Raft state machine.
  uint64 lease_applied_index = 2;

  // range_stats holds the mvcc stats that account for the current value of
  // the Raft state machine.
  MVCCPersistentStats range_stats = 3 [(gogoproto.nullable) = false];
}
// MVCCWriteValueOp corresponds to a value being written outside of a
// transaction.
message MVCCWriteValueOp {
  // NOTE(review): the fields carry no comments in the original schema. They
  // presumably describe the key written, the timestamp of the write, and the
  // value written - confirm against the code that emits this op.
  bytes key = 1;

  // timestamp is declared non-nullable.
  util.hlc.Timestamp timestamp = 2 [
    (gogoproto.nullable) = false
  ];

  bytes value = 3;
}
// MVCCWriteIntentOp corresponds to an intent being written for a given
// transaction.
message MVCCWriteIntentOp {
  // txn_id is surfaced in Go as the non-nullable uuid.UUID field TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // NOTE(review): txn_key and txn_min_timestamp presumably mirror the key and
  // min_timestamp fields of TxnMeta - confirm against the code that emits
  // this op.
  bytes txn_key = 2;

  // txn_min_timestamp uses field number 4 although it is declared ahead of
  // field number 3. Both timestamps are non-nullable.
  util.hlc.Timestamp txn_min_timestamp = 4 [
    (gogoproto.nullable) = false
  ];

  util.hlc.Timestamp timestamp = 3 [
    (gogoproto.nullable) = false
  ];
}
// MVCCUpdateIntentOp corresponds to an intent being updated at a larger
// timestamp for a given transaction.
message MVCCUpdateIntentOp {
  // txn_id is surfaced in Go as the non-nullable uuid.UUID field TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // timestamp is declared non-nullable.
  // NOTE(review): presumably the larger timestamp the intent was moved to -
  // confirm against the code that emits this op.
  util.hlc.Timestamp timestamp = 2 [
    (gogoproto.nullable) = false
  ];
}
// MVCCCommitIntentOp corresponds to an intent being committed for a given
// transaction.
message MVCCCommitIntentOp {
  // txn_id is surfaced in Go as the non-nullable uuid.UUID field TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];

  // NOTE(review): key, timestamp and value presumably describe the committed
  // version of the intent - confirm against the code that emits this op.
  bytes key = 2;

  // timestamp is declared non-nullable.
  util.hlc.Timestamp timestamp = 3 [
    (gogoproto.nullable) = false
  ];

  bytes value = 4;
}
// MVCCAbortIntentOp corresponds to an intent being aborted for a given
// transaction.
//
// This operation does not necessarily indicate that the intent's transaction
// was aborted, just that an intent was removed without being committed. For
// instance, a committed transaction will abort any intents it decided not to
// write in its final epoch.
message MVCCAbortIntentOp {
  // txn_id is the only field of this op. It is surfaced in Go as the
  // non-nullable uuid.UUID field TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];
}
// MVCCAbortTxnOp corresponds to an entire transaction being aborted. The
// operation indicates that none of the transaction's intents will ever be
// committed.
message MVCCAbortTxnOp {
  // txn_id is the only field of this op. It is surfaced in Go as the
  // non-nullable uuid.UUID field TxnID.
  bytes txn_id = 1 [
    (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID",
    (gogoproto.customname) = "TxnID",
    (gogoproto.nullable) = false
  ];
}
// MVCCLogicalOp is a union of all logical MVCC operation types.
message MVCCLogicalOp {
  option (gogoproto.onlyone) = true;

  // One field per logical operation type.
  // NOTE(review): with gogoproto.onlyone the message is presumably meant to
  // carry exactly one of these at a time - confirm against producers.

  MVCCWriteValueOp   write_value   = 1;
  MVCCWriteIntentOp  write_intent  = 2;
  MVCCUpdateIntentOp update_intent = 3;
  MVCCCommitIntentOp commit_intent = 4;
  MVCCAbortIntentOp  abort_intent  = 5;
  MVCCAbortTxnOp     abort_txn     = 6;
}