-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
Copy pathreplica_evaluate.go
461 lines (425 loc) · 16.1 KB
/
replica_evaluate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package storage
import (
"bytes"
"context"
"math"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/batcheval"
"github.com/cockroachdb/cockroach/pkg/storage/batcheval/result"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
"github.com/cockroachdb/cockroach/pkg/storage/engine/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/kr/pretty"
"github.com/pkg/errors"
)
// optimizePuts inspects the leading run of Put, ConditionalPut and InitPut
// requests in origReqs. While walking the run it tracks the smallest and
// largest key written; unless distinctSpans is set it also ends the run at
// the first key that repeats an earlier one.
//
// If the run is shorter than optimizePutThreshold, or the iterator reports
// an error, origReqs is returned unchanged. Otherwise a single seek over
// [smallest key, largest key] finds the first existing key in that span
// (if any), and every request in the run whose key sorts strictly before
// that key -- or every request in the run when no key exists -- is marked
// Blind.
//
// The input is never modified: when any rewriting is attempted the
// returned slice is a fresh copy of origReqs, and each request that is
// marked Blind is replaced by a shallow copy.
func optimizePuts(
	batch engine.ReadWriter, origReqs []roachpb.RequestUnion, distinctSpans bool,
) []roachpb.RequestUnion {
	var lowKey, highKey roachpb.Key
	var seen map[string]struct{}
	if !distinctSpans {
		seen = make(map[string]struct{}, len(origReqs))
	}

	// noteKey widens [lowKey, highKey] to cover k. When duplicate tracking
	// is enabled it reports false (and changes nothing) for a key that was
	// already recorded.
	noteKey := func(k roachpb.Key) bool {
		if seen != nil {
			if _, dup := seen[string(k)]; dup {
				return false
			}
			seen[string(k)] = struct{}{}
		}
		if lowKey == nil || bytes.Compare(k, lowKey) < 0 {
			lowKey = k
		}
		if highKey == nil || bytes.Compare(k, highKey) > 0 {
			highKey = k
		}
		return true
	}

	// Measure the run: it ends at the first request that is not one of the
	// three put flavors, or whose key noteKey rejects.
	runLen := len(origReqs)
scan:
	for idx, ru := range origReqs {
		var k roachpb.Key
		switch req := ru.GetInner().(type) {
		case *roachpb.PutRequest:
			k = req.Key
		case *roachpb.ConditionalPutRequest:
			k = req.Key
		case *roachpb.InitPutRequest:
			k = req.Key
		default:
			runLen = idx
			break scan
		}
		if !noteKey(k) {
			runLen = idx
			break scan
		}
	}

	if runLen < optimizePutThreshold {
		// Not enough puts in the run to be worth a seek.
		return origReqs
	}

	// UpperBound is exclusive, so bound the iterator by the key following
	// highKey in order for highKey itself to be visible.
	it := batch.NewIterator(engine.IterOptions{
		UpperBound: highKey.Next(),
	})
	defer it.Close()

	// Locate the first existing key at or after lowKey.
	it.SeekGE(engine.MakeMVCCMetadataKey(lowKey))
	valid, err := it.Valid()
	if err != nil {
		// TODO(bdarnell): return an error here instead of silently
		// running without the optimization?
		log.Errorf(context.TODO(), "Seek returned error; disabling blind-put optimization: %+v", err)
		return origReqs
	}
	// firstExisting stays nil when nothing exists within the written span.
	var firstExisting roachpb.Key
	if valid && bytes.Compare(it.Key().Key, highKey) <= 0 {
		firstExisting = it.Key().Key
	}

	// Work on a copy of the slice so the caller's requests stay untouched.
	out := append([]roachpb.RequestUnion(nil), origReqs...)
	for i := 0; i < runLen; i++ {
		inner := out[i].GetInner()
		if firstExisting != nil && bytes.Compare(firstExisting, inner.Header().Key) <= 0 {
			// An existing key sorts at or before this request's key, so
			// this request is left as is.
			continue
		}
		switch req := inner.(type) {
		case *roachpb.PutRequest:
			blind := *req
			blind.Blind = true
			out[i].MustSetInner(&blind)
		case *roachpb.ConditionalPutRequest:
			blind := *req
			blind.Blind = true
			out[i].MustSetInner(&blind)
		case *roachpb.InitPutRequest:
			blind := *req
			blind.Blind = true
			out[i].MustSetInner(&blind)
		default:
			log.Fatalf(context.TODO(), "unexpected non-put request: %s", req)
		}
	}
	return out
}
// evaluateBatch evaluates a batch request by splitting it up into its
// individual commands, passing them to evaluateCommand, and combining
// the results.
//
// Requests are evaluated in order against batch. When readOnly is false
// and the batch contains at least optimizePutThreshold requests, the
// request slice is first passed through optimizePuts. Evaluation works on
// a local copy of ba.Header and, when the header carries a transaction, on
// a clone of that transaction, so that ba is not mutated directly.
//
// On success the populated BatchResponse, the merged Result and a nil
// error are returned. On failure the returned BatchResponse is nil; the
// returned Result is empty if the abort check failed before any command
// ran, and otherwise holds whatever had been merged when the error was
// returned. WriteTooOldErrors are not returned as soon as they occur; see
// the comments in the body for when they are deferred, returned, or
// discarded.
//
// The function calls log.Fatalf if a command's Result cannot be merged or
// if a reply reports more keys than the remaining key limit.
func evaluateBatch(
ctx context.Context,
idKey storagebase.CmdIDKey,
batch engine.ReadWriter,
rec batcheval.EvalContext,
ms *enginepb.MVCCStats,
ba *roachpb.BatchRequest,
readOnly bool,
) (*roachpb.BatchResponse, result.Result, *roachpb.Error) {
// NB: Don't mutate BatchRequest directly.
baReqs := ba.Requests
baHeader := ba.Header
br := ba.CreateReply()
// maxKeys is the remaining key budget for the batch; math.MaxInt64 is
// used as the "no limit" sentinel.
maxKeys := int64(math.MaxInt64)
if baHeader.MaxSpanRequestKeys != 0 {
// We have a batch of requests with a limit. We keep track of how many
// remaining keys we can touch.
maxKeys = baHeader.MaxSpanRequestKeys
}
// Optimize any contiguous sequences of put and conditional put ops.
if len(baReqs) >= optimizePutThreshold && !readOnly {
baReqs = optimizePuts(batch, baReqs, baHeader.DistinctSpans)
}
// Create a clone of the transaction to store the new txn state produced on
// the return/error path.
if baHeader.Txn != nil {
baHeader.Txn = baHeader.Txn.Clone()
// Check whether this transaction has been aborted, if applicable.
// This applies to writes that leave intents (the use of the
// IsTransactionWrite flag excludes operations like HeartbeatTxn),
// and reads that occur in a transaction that has already written
// (see #2231 for more about why we check for aborted transactions
// on reads). Note that 1PC transactions have had their
// transaction field cleared by this point so we do not execute
// this check in that case.
if ba.IsTransactionWrite() || baHeader.Txn.IsWriting() {
// We don't check the abort span for a couple of special requests:
// - if the request is asking to abort the transaction, then don't check the
// AbortSpan; we don't want the request to be rejected if the transaction
// has already been aborted.
// - heartbeats don't check the abort span. If the txn is aborted, they'll
// return an aborted proto in their otherwise successful response.
// TODO(nvanbenschoten): Let's remove heartbeats from this whitelist when
// we rationalize the TODO in txnHeartbeater.heartbeat.
singleAbort := ba.IsSingleEndTransactionRequest() &&
!baReqs[0].GetInner().(*roachpb.EndTransactionRequest).Commit
if !singleAbort && !ba.IsSingleHeartbeatTxnRequest() {
if pErr := checkIfTxnAborted(ctx, rec, batch, *baHeader.Txn); pErr != nil {
return nil, result.Result{}, pErr
}
}
}
}
// NB: from here on the local variable result shadows the imported
// result package within this function.
var result result.Result
// WriteTooOldErrors are unique: When one is returned, we also lay
// down an intent at our new proposed timestamp. We have the option
// of continuing past a WriteTooOldError to the end of the
// transaction (at which point the txn.WriteTooOld flag will trigger
// a RefreshSpan and possibly a client-side retry).
//
// Within a batch, there's no downside to continuing past the
// WriteTooOldError, so we at least defer returning the error to the
// end of the batch.
//
// Across batches, it's more complicated. We want to avoid
// client-side retries whenever possible. However, if a client-side
// retry is inevitable, it's probably best to continue and lay down
// as many intents as possible before that retry (this can avoid n^2
// behavior in some scenarios with high contention on multiple keys,
// although we haven't verified this in practice).
//
// The SQL layer will transparently retry on the server side if
// we're in the first statement in a transaction. If we're in a
// first statement, we want to return WriteTooOldErrors immediately
// to take advantage of this. We don't have this information
// available at this level currently, so we err on the side of
// returning the WriteTooOldError immediately to get the server-side
// retry when it is available.
//
// TODO(bdarnell): Plumb the SQL CanAutoRetry field through to
// !baHeader.DeferWriteTooOldError.
//
// A more subtle heuristic is also possible: If we get a
// WriteTooOldError while writing to a key that we have already read
// (either earlier in the transaction, or as a part of the same
// operation for a ConditionalPut, Increment, or InitPut), a
// WriteTooOldError that is deferred to the end of the transaction
// is guaranteed to result in a failed RefreshSpans and therefore a
// client-side retry. In some cases it may be possible to
// successfully retry at the TxnCoordSender, avoiding the
// client-side retry (this is likely for Increment, but unlikely for
// the others). In such cases, we may want to return the
// WriteTooOldError even if the SQL CanAutoRetry is false. As of
// this writing, nearly all writes issued by SQL are preceded by
// reads of the same key.
//
// writeTooOldErr holds the first WriteTooOldError seen in the batch (its
// ActualTimestamp is forwarded by later ones); mustReturnWriteTooOldErr
// records that the error may not be deferred.
var writeTooOldErr *roachpb.Error
mustReturnWriteTooOldErr := false
for index, union := range baReqs {
// Execute the command.
args := union.GetInner()
if baHeader.Txn != nil {
// Set the Request's sequence number on the TxnMeta for this
// request. The MVCC layer (currently) uses TxnMeta to
// pass input arguments, such as the seqnum at which a
// request operates.
baHeader.Txn.Sequence = args.Header().Sequence
}
// Note that responses are populated even when an error is returned.
// TODO(tschottdorf): Change that. IIRC there is nontrivial use of it currently.
reply := br.Responses[index].GetInner()
curResult, pErr := evaluateCommand(ctx, idKey, index, batch, rec, ms, baHeader, maxKeys, args, reply)
// The command's Result is merged before pErr is inspected, so a Result
// returned alongside an error is still absorbed.
if err := result.MergeAndDestroy(curResult); err != nil {
// TODO(tschottdorf): see whether we really need to pass nontrivial
// Result up on error and if so, formalize that.
log.Fatalf(
ctx,
"unable to absorb Result: %s\ndiff(new, old): %s",
err, pretty.Diff(curResult, result),
)
}
if pErr != nil {
// Initialize the error index.
pErr.SetErrorIndex(int32(index))
switch tErr := pErr.GetDetail().(type) {
case *roachpb.WriteTooOldError:
// We got a WriteTooOldError. We continue on to run all
// commands in the batch in order to determine the highest
// timestamp for more efficient retries. If the batch is
// transactional, we continue to lay down intents so that
// other concurrent overlapping transactions are forced
// through intent resolution and the chances of this batch
// succeeding when it will be retried are increased.
if writeTooOldErr != nil {
writeTooOldErr.GetDetail().(*roachpb.WriteTooOldError).ActualTimestamp.Forward(tErr.ActualTimestamp)
} else {
writeTooOldErr = pErr
}
// Requests which are both read and write are not currently
// accounted for in RefreshSpans, so they rely on eager
// returning of WriteTooOldErrors.
// TODO(bdarnell): add read+write requests to the read refresh spans
// in TxnCoordSender, and then I think this can go away.
if roachpb.IsReadAndWrite(args) {
mustReturnWriteTooOldErr = true
}
if baHeader.Txn != nil {
baHeader.Txn.WriteTimestamp.Forward(tErr.ActualTimestamp)
baHeader.Txn.WriteTooOld = true
}
// Clear pErr; we're done processing it by having moved the
// batch or txn timestamps forward and set WriteTooOld if this
// is a transactional write. If we don't return the
// WriteTooOldError from this method, we will detect the
// pushed timestamp at commit time and refresh or retry the
// transaction.
pErr = nil
default:
// Any other error aborts evaluation of the batch immediately.
return nil, result, pErr
}
}
// When a key limit is in effect, deduct the keys reported by this reply
// from the remaining budget. A reply that reports more keys than the
// budget allowed is treated as fatal.
if maxKeys != math.MaxInt64 {
retResults := reply.Header().NumKeys
if retResults > maxKeys {
log.Fatalf(ctx, "received %d results, limit was %d", retResults, maxKeys)
}
maxKeys -= retResults
}
// If transactional, we use ba.Txn for each individual command and
// accumulate updates to it. Once accumulated, we then remove the Txn
// from each individual response.
// TODO(spencer,tschottdorf): need copy-on-write behavior for the
// updated batch transaction / timestamp.
if baHeader.Txn != nil {
if header := reply.Header(); header.Txn != nil {
baHeader.Txn.Update(header.Txn)
header.Txn = nil
reply.SetHeader(header)
}
}
}
// If there was an EndTransaction in the batch that finalized the transaction,
// the WriteTooOld status has been fully processed and we can discard the error.
if baHeader.Txn != nil && baHeader.Txn.Status.IsFinalized() {
writeTooOldErr = nil
} else if baHeader.Txn == nil {
// Non-transactional requests are unable to defer WriteTooOldErrors
// because there is nowhere to defer them to.
mustReturnWriteTooOldErr = true
}
// If there's a write too old error, return now that we've found
// the high water timestamp for retries.
if writeTooOldErr != nil && (mustReturnWriteTooOldErr || !baHeader.DeferWriteTooOldError) {
return nil, result, writeTooOldErr
}
if baHeader.Txn != nil {
// If transactional, send out the final transaction entry with the reply.
br.Txn = baHeader.Txn
// If the transaction committed, forward the response
// timestamp to the commit timestamp in case we were able to
// optimize and commit at a higher timestamp without higher-level
// retry (i.e. there were no refresh spans and the commit timestamp
// wasn't leaked).
if baHeader.Txn.Status == roachpb.COMMITTED {
br.Timestamp.Forward(baHeader.Txn.WriteTimestamp)
}
}
// Always update the batch response timestamp field to the timestamp at
// which the batch executed.
br.Timestamp.Forward(baHeader.Timestamp)
return br, result, nil
}
// evaluateCommand evaluates a single request by looking up the command
// registered for args.Method() and invoking its Eval method against batch,
// writing the response into reply.
//
// If a TestingEvalFilter is installed in the evaluation knobs it runs
// first; an error from the filter is logged and returned without
// evaluating the command. A method with no registered command yields an
// "unrecognized command" error. When h.ReturnRangeInfo is set, range info
// is added to reply whether or not evaluation succeeded.
//
// maxKeys is the number of scan results remaining for this batch
// (MaxInt64 for no limit) and is forwarded to the command. The returned
// Result may be partially valid even if an error is returned. An
// evaluation error is wrapped together with the transaction from reply's
// header, falling back to h.Txn when the reply carries none.
func evaluateCommand(
	ctx context.Context,
	raftCmdID storagebase.CmdIDKey,
	index int,
	batch engine.ReadWriter,
	rec batcheval.EvalContext,
	ms *enginepb.MVCCStats,
	h roachpb.Header,
	maxKeys int64,
	args roachpb.Request,
	reply roachpb.Response,
) (result.Result, *roachpb.Error) {
	// Give an installed unit-test filter the chance to inject an error
	// before anything is evaluated.
	if evalFilter := rec.EvalKnobs().TestingEvalFilter; evalFilter != nil {
		injected := evalFilter(storagebase.FilterArgs{
			Ctx:   ctx,
			CmdID: raftCmdID,
			Index: index,
			Sid:   rec.StoreID(),
			Req:   args,
			Hdr:   h,
		})
		if injected != nil {
			log.Infof(ctx, "test injecting error: %s", injected)
			return result.Result{}, injected
		}
	}

	var (
		res     result.Result
		evalErr error
	)
	cmd, found := batcheval.LookupCommand(args.Method())
	if !found {
		evalErr = errors.Errorf("unrecognized command %s", args.Method())
	} else {
		res, evalErr = cmd.Eval(ctx, batch, batcheval.CommandArgs{
			EvalCtx: rec,
			Header:  h,
			Args:    args,
			MaxKeys: maxKeys,
			Stats:   ms,
		}, reply)
	}

	// Range info is attached regardless of the evaluation outcome.
	if h.ReturnRangeInfo {
		returnRangeInfo(reply, rec)
	}

	// TODO(peter): We'd like to assert that the hlc clock is always updated
	// correctly, but various tests insert versioned data without going through
	// the proper channels. See TestPushTxnUpgradeExistingTxn for an example.
	//
	// if header.Txn != nil && !header.Txn.Timestamp.Less(h.Timestamp) {
	// if now := r.store.Clock().Now(); now.Less(header.Txn.Timestamp) {
	// log.Fatalf(ctx, "hlc clock not updated: %s < %s", now, header.Txn.Timestamp)
	// }
	// }

	if log.V(2) {
		log.Infof(ctx, "evaluated %s command %+v: %+v, err=%v", args.Method(), args, reply, evalErr)
	}

	if evalErr == nil {
		return res, nil
	}

	// Build the roachpb.Error around the reply's transaction when it has
	// one, and around the request header's transaction otherwise.
	errTxn := reply.Header().Txn
	if errTxn == nil {
		errTxn = h.Txn
	}
	return res, roachpb.NewErrorWithTxn(evalErr, errTxn)
}
// returnRangeInfo overwrites the RangeInfos field of reply's header with a
// single entry built from rec's range descriptor and the first value
// returned by rec.GetLease(), then stores the header back on reply. The
// second value returned by GetLease is discarded.
func returnRangeInfo(reply roachpb.Response, rec batcheval.EvalContext) {
	hdr := reply.Header()
	curLease, _ := rec.GetLease()
	info := roachpb.RangeInfo{
		Desc:  *rec.Desc(),
		Lease: curLease,
	}
	hdr.RangeInfos = []roachpb.RangeInfo{info}
	reply.SetHeader(hdr)
}