-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathraft_log_queue.go
618 lines (558 loc) · 25.2 KB
/
raft_log_queue.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package storage
import (
"context"
"fmt"
"sort"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/pkg/errors"
"go.etcd.io/etcd/raft"
"go.etcd.io/etcd/raft/tracker"
)
const (
// raftLogQueueTimerDuration is the duration between truncations. It is the
// value returned by (*raftLogQueue).timer.
raftLogQueueTimerDuration = 0 // zero duration to process truncations greedily
// RaftLogQueueStaleThreshold is the minimum threshold for stale raft log
// entries. A stale entry is one which all replicas of the range have
// progressed past and thus is no longer needed and can be truncated.
RaftLogQueueStaleThreshold = 100
// RaftLogQueueStaleSize is the minimum size of the Raft log that we'll
// truncate even if there are fewer than RaftLogQueueStaleThreshold entries
// to truncate. The value of 64 KB was chosen experimentally by looking at
// when Raft log truncation usually occurs when using the number of entries
// as the sole criterion.
RaftLogQueueStaleSize = 64 << 10
// raftLogQueueConcurrency allows a limited number of Raft log truncations
// to be processed concurrently. It is passed to the base queue as its
// maxConcurrency.
raftLogQueueConcurrency = 4
// raftLogQueuePendingSnapshotGracePeriod: while a snapshot is in flight, we
// won't truncate past the snapshot's log index. This behavior is extended
// to a grace period after the snapshot is marked as completed as it is
// applied at the receiver only a little later, leaving a window for a
// truncation that requires another snapshot.
raftLogQueuePendingSnapshotGracePeriod = 3 * time.Second
)
// raftLogQueue manages a queue of replicas slated to have their raft logs
// truncated by removing unneeded entries.
type raftLogQueue struct {
// baseQueue supplies the generic queueing machinery; it is created in
// newRaftLogQueue with this raftLogQueue as its implementation.
*baseQueue
// db is the client through which process sends the TruncateLogRequest.
db *client.DB
// logSnapshots gates (via ShouldProcess) the info-level logging in process
// of truncation decisions that imply new Raft snapshots. It is initialized
// with a 10 second period in newRaftLogQueue.
logSnapshots util.EveryN
}
// newRaftLogQueue constructs the raftLogQueue for the given store.
//
// Replicas reach the queue both proactively (triggered by write load) and
// periodically (via the scanner). For each replica processed, the queue
// weighs keeping the Raft log short against letting slower followers catch
// up from the log before a truncation cuts them off and forces a snapshot.
// See newTruncateDecision for how that decision is made.
//
// The queue does not require the lease or the system config, and it accepts
// ranges that still need to be split.
func newRaftLogQueue(store *Store, db *client.DB, gossip *gossip.Gossip) *raftLogQueue {
	cfg := queueConfig{
		maxSize:              defaultQueueMaxSize,
		maxConcurrency:       raftLogQueueConcurrency,
		needsLease:           false,
		needsSystemConfig:    false,
		acceptsUnsplitRanges: true,
		successes:            store.metrics.RaftLogQueueSuccesses,
		failures:             store.metrics.RaftLogQueueFailures,
		pending:              store.metrics.RaftLogQueuePending,
		processingNanos:      store.metrics.RaftLogQueueProcessingNanos,
	}

	q := &raftLogQueue{
		db:           db,
		logSnapshots: util.Every(10 * time.Second),
	}
	// The base queue needs a reference back to q, so it can only be attached
	// once q exists.
	q.baseQueue = newBaseQueue("raftlog", q, store, gossip, cfg)
	return q
}
// newTruncateDecision returns a truncateDecision for the given Replica if no
// error occurs. If input data to establish a truncateDecision is missing, a
// zero decision is returned.
//
// At a high level, a truncate decision operates based on the Raft log size, the
// number of entries in the log, and the Raft status of the followers. In an
// ideal world and most of the time, followers are reasonably up to date, and a
// decision to truncate to the index acked on all replicas will be made whenever
// there is at least a little bit of log to truncate (think a hundred records or
// ~100kb of data). If followers fall behind, are offline, or are waiting for a
// snapshot, a second strategy is needed to make sure that the Raft log is
// eventually truncated: when the raft log size exceeds a limit (4mb at time of
// writing), truncations become willing and able to cut off followers as long as
// a quorum has acked the truncation index. The quota pool ensures that the delta
// between "acked by quorum" and "acked by all" is bounded, while Raft limits the
// size of the uncommitted, i.e. not "acked by quorum", part of the log; thus
// the "quorum" truncation strategy bounds the absolute size of the log on all
// followers.
//
// Exceptions are made for replicas for which information is missing ("probing
// state") as long as they are known to have been online recently, and for
// in-flight snapshots (in particular preemptive snapshots) which are not
// adequately reflected in the Raft status and would otherwise be cut off with
// regularity. Probing live followers should only remain in this state for a
// short moment and so we deny a log truncation outright (as there's no safe
// index to truncate to); for snapshots, we can still truncate, but not past
// the snapshot's index.
//
// A challenge for log truncation is to deal with sideloaded log entries, that
// is, entries which contain SSTables for direct ingestion into the storage
// engine. Such log entries are very large, and failing to account for them in
// the heuristics can trigger overly aggressive truncations.
//
// The raft log size used in the decision making process is principally updated
// in the main Raft command apply loop, and adds a Replica to this queue
// whenever the log size has increased by a non-negligible amount that would be
// worth truncating (~100kb).
//
// Unfortunately, the size tracking is not very robust as it suffers from two
// limitations at the time of writing:
// 1. it may undercount as it is in-memory and incremented only as proposals
// are handled; that is, a freshly started node will believe its Raft log to be
// zero-sized independent of its actual size, and
// 2. the addition and corresponding subtraction happen in very different places
// and are difficult to keep bug-free, meaning that there is low confidence that
// we maintain the delta in a completely accurate manner over time. One example
// of potential errors are sideloaded proposals, for which the subtraction needs
// to load the size of the file on-disk (i.e. supplied by the fs), whereas
// the addition uses the in-memory representation of the file.
//
// Ideally, a Raft log that grows large for whichever reason (for instance the
// queue being stuck on another replica) wouldn't be more than a nuisance on
// nodes with sufficient disk space. Unfortunately, at the time of writing, the
// Raft log is included in Raft snapshots. On the other hand, IMPORT/RESTORE's
// split/scatter phase interacts poorly with overly aggressive truncations and
// can DDOS the Raft snapshot queue.
func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, error) {
	rangeID := r.RangeID
	now := timeutil.Now()

	// NB: we need an exclusive lock due to grabbing the first index.
	r.mu.Lock()
	raftLogSize := r.mu.raftLogSize
	// A "cooperative" truncation (i.e. one that does not cut off followers from
	// the log) takes place whenever there are more than
	// RaftLogQueueStaleThreshold entries or the log's estimated size is above
	// RaftLogQueueStaleSize bytes. This is fairly aggressive, so under normal
	// conditions, the log is very small.
	//
	// If followers start falling behind, at some point the logs still need to
	// be truncated. We do this when the size of the log exceeds
	// RaftLogTruncationThreshold (or, in eccentric configurations, the zone's
	// RangeMaxBytes, whichever is smaller). This captures the heuristic that
	// at some point, it's more efficient to catch up via a snapshot than via
	// applying a long tail of log entries.
	targetSize := r.store.cfg.RaftLogTruncationThreshold
	if targetSize > *r.mu.zone.RangeMaxBytes {
		targetSize = *r.mu.zone.RangeMaxBytes
	}
	raftStatus := r.raftStatusRLocked()
	// The error is checked only after the lock has been released so that all
	// remaining fields can be read in this one critical section.
	firstIndex, err := r.raftFirstIndexLocked()
	const anyRecipientStore roachpb.StoreID = 0
	pendingSnapshotIndex := r.getAndGCSnapshotLogTruncationConstraintsLocked(now, anyRecipientStore)
	lastIndex := r.mu.lastIndex
	logSizeTrusted := r.mu.raftLogSizeTrusted
	r.mu.Unlock()

	if err != nil {
		// Wrap rather than re-format so that the cause remains available to
		// callers; the resulting message is unchanged.
		return truncateDecision{}, errors.Wrapf(err, "error retrieving first index for r%d", rangeID)
	}

	if raftStatus == nil {
		if log.V(6) {
			log.Infof(ctx, "the raft group doesn't exist for r%d", rangeID)
		}
		return truncateDecision{}, nil
	}

	// Is this the raft leader? We only perform log truncation on the raft leader
	// which has the up to date info on followers.
	if raftStatus.RaftState != raft.StateLeader {
		return truncateDecision{}, nil
	}

	// For all our followers, overwrite the RecentActive field (which is always
	// true since we don't use CheckQuorum) with our own activity check.
	r.mu.RLock()
	log.Eventf(ctx, "raft status before lastUpdateTimes check: %+v", raftStatus.Progress)
	log.Eventf(ctx, "lastUpdateTimes: %+v", r.mu.lastUpdateTimes)
	updateRaftProgressFromActivity(
		ctx, raftStatus.Progress, r.descRLocked().Replicas().All(), r.mu.lastUpdateTimes, now,
	)
	log.Eventf(ctx, "raft status after lastUpdateTimes check: %+v", raftStatus.Progress)
	r.mu.RUnlock()

	if pr, ok := raftStatus.Progress[raftStatus.Lead]; ok {
		// TODO(tschottdorf): remove this line once we have picked up
		// https://github.com/etcd-io/etcd/pull/10279
		pr.State = tracker.StateReplicate
		raftStatus.Progress[raftStatus.Lead] = pr
	}

	input := truncateDecisionInput{
		RaftStatus:                     *raftStatus,
		LogSize:                        raftLogSize,
		MaxLogSize:                     targetSize,
		LogSizeTrusted:                 logSizeTrusted,
		FirstIndex:                     firstIndex,
		LastIndex:                      lastIndex,
		PendingPreemptiveSnapshotIndex: pendingSnapshotIndex,
	}

	return computeTruncateDecision(input), nil
}
// updateRaftProgressFromActivity rewrites, in place, the entries of prs that
// belong to the given replicas. For each such entry, RecentActive is replaced
// by the result of lastUpdate.isFollowerActive at time now, and
// PendingSnapshot is reset to zero. Replicas without an entry in prs are
// skipped, and entries of prs without a matching replica are left untouched.
func updateRaftProgressFromActivity(
	ctx context.Context,
	prs map[uint64]tracker.Progress,
	replicas []roachpb.ReplicaDescriptor,
	lastUpdate lastUpdateTimesMap,
	now time.Time,
) {
	for i := range replicas {
		id := replicas[i].ReplicaID
		key := uint64(id)

		progress, found := prs[key]
		if found {
			progress.RecentActive = lastUpdate.isFollowerActive(ctx, id, now)

			// PendingSnapshot is cleared for safety since it isn't used: the
			// truncation logic relies on the separately tracked pending
			// snapshot index instead, which is also populated for preemptive
			// snapshots.
			//
			// NOTE: PendingSnapshot is not relied upon because it is
			// initialized by the leader when it realizes the follower needs a
			// snapshot, and it isn't initialized with the index of the
			// snapshot that is actually sent by us (out of band), which
			// likely is lower.
			progress.PendingSnapshot = 0

			// Progress is stored by value, so write the modified copy back.
			prs[key] = progress
		}
	}
}
// The truncatableIndexChosenVia* constants are the values stored in
// truncateDecision.ChosenVia. Each names the constraint that determined the
// decision's NewFirstIndex, and the value is printed by
// (*truncateDecision).String.
const (
// The quorum index (see getQuorumIndex) bounded the truncation.
truncatableIndexChosenViaQuorumIndex = "quorum"
// A follower's Match index bounded the truncation.
truncatableIndexChosenViaFollowers = "followers"
// A recently active follower in probing state forced the truncation back to
// the current first index.
truncatableIndexChosenViaProbingFollower = "probing follower"
// The index of a pending snapshot bounded the truncation.
truncatableIndexChosenViaPendingSnap = "pending snapshot"
// The computed index fell below the current first index and was raised to
// it, making the truncation a no-op.
truncatableIndexChosenViaFirstIndex = "first index"
// No constraint lowered the index below the last index, the starting point.
truncatableIndexChosenViaLastIndex = "last index"
)
// truncateDecisionInput bundles the data that computeTruncateDecision needs in
// order to decide whether and where to truncate the Raft log.
type truncateDecisionInput struct {
// RaftStatus is the Raft status of the group; its Progress map supplies the
// per-replica State, Match, Next and RecentActive values that are consulted.
RaftStatus raft.Status
// LogSize is the tracked size of the Raft log and MaxLogSize the size above
// which the log is considered too large (see LogTooLarge).
LogSize, MaxLogSize int64
LogSizeTrusted bool // false when LogSize might be off
// FirstIndex and LastIndex are the first and last index of the current log.
FirstIndex, LastIndex uint64
// PendingPreemptiveSnapshotIndex is the log index of a pending snapshot
// that must not be truncated away; zero means there is none.
PendingPreemptiveSnapshotIndex uint64
}
// LogTooLarge reports whether the tracked log size strictly exceeds the
// maximum log size.
func (input truncateDecisionInput) LogTooLarge() bool {
	limit, size := input.MaxLogSize, input.LogSize
	return limit < size
}
// truncateDecision is the outcome of computeTruncateDecision: the input it was
// derived from along with the proposed new first index of the log.
type truncateDecision struct {
// Input is the data the decision was computed from.
Input truncateDecisionInput
QuorumIndex uint64 // largest index known to be present on quorum
NewFirstIndex uint64 // first index of the resulting log after truncation
// ChosenVia is one of the truncatableIndexChosenVia* constants and names
// the constraint that determined NewFirstIndex.
ChosenVia string
}
// raftSnapshotsForIndex counts how many Raft snapshots a truncation to the
// given index would imply. A replicating follower is counted when both its
// Match and its Next are behind index in the way described below; a pending
// snapshot is counted when its index is nonzero and below index.
func (td *truncateDecision) raftSnapshotsForIndex(index uint64) int {
	// Followers that aren't replicating are skipped below: their Match can't
	// be trusted in the first place. This shouldn't matter in practice as we
	// already take care to not cut off these followers when computing the
	// truncate decision. See:
	_ = truncatableIndexChosenViaProbingFollower // guru ref

	count := 0
	for _, pr := range td.Input.RaftStatus.Progress {
		// When a log truncation happens at the "current log index" (i.e. the
		// most recently committed index), it is often still in flight to the
		// followers not required for quorum, and it is likely that they won't
		// need a truncation to catch up. A follower in that state will have a
		// Match equaling committed-1, but a Next of committed+1 (indicating
		// that an append at 'committed' is already ongoing). Hence both Match
		// and Next need to be behind for the follower to count.
		if pr.State == tracker.StateReplicate && pr.Match < index && pr.Next <= index {
			count++
		}
	}

	if snapIndex := td.Input.PendingPreemptiveSnapshotIndex; snapIndex != 0 && snapIndex < index {
		count++
	}
	return count
}
// NumNewRaftSnapshots returns the number of Raft snapshots implied by
// truncating to NewFirstIndex minus the number already implied by the
// current first index.
func (td *truncateDecision) NumNewRaftSnapshots() int {
	after := td.raftSnapshotsForIndex(td.NewFirstIndex)
	before := td.raftSnapshotsForIndex(td.Input.FirstIndex)
	return after - before
}
// String renders the decision for logging: whether a truncation should
// happen, how many entries it covers, the new first index and the constraint
// that selected it, followed by optional notes about an oversized log,
// implied Raft snapshots and an untrusted log size.
func (td *truncateDecision) String() string {
	in := &td.Input

	var sb strings.Builder
	_, _ = fmt.Fprintf(&sb, "should truncate: %t [", td.ShouldTruncate())
	_, _ = fmt.Fprintf(
		&sb,
		"truncate %d entries to first index %d (chosen via: %s)",
		td.NumTruncatableIndexes(), td.NewFirstIndex, td.ChosenVia,
	)

	if in.LogTooLarge() {
		size := humanizeutil.IBytes(in.LogSize)
		limit := humanizeutil.IBytes(in.MaxLogSize)
		_, _ = fmt.Fprintf(&sb, "; log too large (%s > %s)", size, limit)
	}

	if snaps := td.NumNewRaftSnapshots(); snaps > 0 {
		_, _ = fmt.Fprintf(&sb, "; implies %d Raft snapshot%s", snaps, util.Pluralize(int64(snaps)))
	}

	if !in.LogSizeTrusted {
		_, _ = sb.WriteString("; log size untrusted")
	}

	_ = sb.WriteByte(']')
	return sb.String()
}
// NumTruncatableIndexes returns the number of log entries between the current
// first index and NewFirstIndex, or zero if NewFirstIndex lies before the
// current first index.
func (td *truncateDecision) NumTruncatableIndexes() int {
	current, proposed := td.Input.FirstIndex, td.NewFirstIndex
	if proposed >= current {
		return int(proposed - current)
	}
	// Guard against the unsigned subtraction wrapping around.
	return 0
}
// ShouldTruncate reports whether the decision is worth acting on: either at
// least RaftLogQueueStaleThreshold entries can be truncated, or at least one
// entry can be truncated and the log size is at least RaftLogQueueStaleSize.
func (td *truncateDecision) ShouldTruncate() bool {
	switch n := td.NumTruncatableIndexes(); {
	case n >= RaftLogQueueStaleThreshold:
		return true
	case n > 0:
		return td.Input.LogSize >= RaftLogQueueStaleSize
	default:
		return false
	}
}
// ProtectIndex tries to keep the given log position from being truncated
// away. If the currently proposed truncation point (the first index of the
// log after truncation) lies past index, it is lowered to index and ChosenVia
// is set to chosenVia; otherwise the decision is left unchanged. The
// protection is not guaranteed when index lies outside of the existing
// [FirstIndex,LastIndex] bounds.
func (td *truncateDecision) ProtectIndex(index uint64, chosenVia string) {
	if index >= td.NewFirstIndex {
		// The index is already retained by the current proposal.
		return
	}
	td.NewFirstIndex, td.ChosenVia = index, chosenVia
}
// computeTruncateDecision derives a truncateDecision from the given input.
// It starts out from the most aggressive truncation possible (the last
// index) and then lowers the proposed new first index to protect, in turn,
// the quorum index, the followers, and a pending snapshot. The result never
// proposes a new first index below the current first index.
//
// If a follower is behind, we want to keep old raft log entries so that it
// can catch up without a full snapshot. A follower that has not been active
// recently is only protected as long as the log is not too large (see
// LogTooLarge); past that point a snapshot is the more efficient way to
// catch it up and the log may be truncated up to the quorum index.
//
// Note that when a follower is behind we continue to let the raft log build
// up instead of truncating to the commit index. Consider what would happen
// if we truncated to the commit index whenever a follower is behind and thus
// needs to be caught up via a snapshot. While the snapshot is generated,
// sent, and applied, we would continue to truncate the log. If that takes
// too long, the follower ends up caught up to a point behind the current
// first index and thus requires another snapshot, likely entering a never
// ending loop of snapshots. See #8629.
func computeTruncateDecision(input truncateDecisionInput) truncateDecision {
	// The last index is the most aggressive possible truncation that we could
	// do. Everything else in this method makes the truncation less aggressive.
	decision := truncateDecision{
		Input:         input,
		QuorumIndex:   getQuorumIndex(&input.RaftStatus),
		NewFirstIndex: input.LastIndex,
		ChosenVia:     truncatableIndexChosenViaLastIndex,
	}

	// Start by trying to truncate at the quorum index. Naively, you would
	// expect lastIndex to never be smaller than quorumIndex, but
	// RaftStatus.Progress.Match is updated on the leader when a command is
	// proposed and in a single replica Raft group this also means that
	// RaftStatus.Commit is updated at propose time.
	decision.ProtectIndex(decision.QuorumIndex, truncatableIndexChosenViaQuorumIndex)

	logTooLarge := input.LogTooLarge()
	for _, pr := range input.RaftStatus.Progress {
		// Snapshots are expensive, so we try our best to avoid truncating past
		// where a follower is.
		switch {
		case pr.RecentActive && pr.State == tracker.StateProbe:
			// For live followers which are being probed (i.e. the leader
			// doesn't know how far they've caught up), the Match index is too
			// large, and so the quorum index can be, too. We don't want these
			// followers to require a snapshot since they are most likely going
			// to be caught up very soon (they respond with the "right index"
			// to the first probe or don't respond, in which case they should
			// end up as not recently active). But we also don't know their
			// index, so we can't possibly make a truncation decision that
			// avoids cutting them off; instead the truncation is made a no-op.
			//
			// The scenario in which this is most relevant is during restores,
			// where we split off new ranges that rapidly receive very large
			// log entries while the Raft group is still in a state of
			// discovery (a new leader starts probing followers at its own last
			// index). Additionally, these ranges will be split many times
			// over, resulting in a flurry of snapshots with overlapping bounds
			// that put significant stress on the Raft snapshot queue.
			decision.ProtectIndex(decision.Input.FirstIndex, truncatableIndexChosenViaProbingFollower)

		case pr.RecentActive:
			// We never truncate off a recently active follower, no matter how
			// large the log gets. Recently active shares the (currently 10s)
			// constant with the quota pool, so the quota pool should put a
			// bound on how much the raft log can grow due to this.
			decision.ProtectIndex(pr.Match, truncatableIndexChosenViaFollowers)

		case !logTooLarge:
			// The follower has not been recently active, but we still don't
			// truncate it off as long as the raft log is not too large.
			decision.ProtectIndex(pr.Match, truncatableIndexChosenViaFollowers)

		default:
			// Inactive follower and an oversized log: let the truncation
			// proceed to the quorum index.
		}
	}

	// The pending snapshot index acts as a placeholder for a replica that is
	// about to be added to the range (or is in Raft recovery). We don't want to
	// truncate the log in a way that will require that new replica to be caught
	// up via yet another Raft snapshot.
	if snapIndex := input.PendingPreemptiveSnapshotIndex; snapIndex > 0 {
		decision.ProtectIndex(snapIndex, truncatableIndexChosenViaPendingSnap)
	}

	// If the new first index dropped below the first index, make them equal
	// (resulting in a no-op).
	if first := decision.Input.FirstIndex; decision.NewFirstIndex < first {
		decision.NewFirstIndex = first
		decision.ChosenVia = truncatableIndexChosenViaFirstIndex
	}
	return decision
}
// getQuorumIndex returns the largest index known to be present on a quorum of
// the replicas tracked in raftStatus.Progress. Only replicas in
// StateReplicate contribute their Match index; all others are counted as
// being at index zero.
//
// Note that getQuorumIndex may return 0 if the progress map doesn't contain
// information for a sufficient number of followers (e.g. the local replica
// has only recently become the leader), and that it returns 0 if the progress
// map is empty. In general, the value returned by getQuorumIndex may be
// smaller than raftStatus.Commit which is the log index that has been
// committed by a quorum of replicas where that quorum was determined at the
// time the index was written. If you're thinking of using getQuorumIndex for
// some purpose, consider that raftStatus.Commit might be more appropriate
// (e.g. determining if a replica is up to date).
func getQuorumIndex(raftStatus *raft.Status) uint64 {
	if len(raftStatus.Progress) == 0 {
		// Without any progress information there is no index a quorum is
		// known to have; bail out before indexing into an empty slice below.
		return 0
	}

	match := make([]uint64, 0, len(raftStatus.Progress))
	for _, progress := range raftStatus.Progress {
		if progress.State == tracker.StateReplicate {
			match = append(match, progress.Match)
		} else {
			// The Match of a replica that isn't replicating can't be trusted.
			match = append(match, 0)
		}
	}

	// With the Match values in ascending order, the value at position
	// len-quorum is reached or exceeded by exactly quorum replicas.
	sort.Sort(uint64Slice(match))
	quorum := computeQuorum(len(match))
	return match[len(match)-quorum]
}
// shouldQueue determines whether a replica should be queued, and with which
// priority. It computes a truncate decision for the replica and defers to
// shouldQueueImpl. If the decision cannot be computed, the error is logged as
// a warning and the replica is not queued.
func (rlq *raftLogQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, r *Replica, _ *config.SystemConfig,
) (shouldQ bool, priority float64) {
	decision, err := newTruncateDecision(ctx, r)
	if err != nil {
		log.Warning(ctx, err)
		return false, 0
	}

	// Whether the log size needs to be recomputed is irrelevant here; process
	// determines that again on its own.
	shouldQ, _, priority = rlq.shouldQueueImpl(ctx, decision)
	return shouldQ, priority
}
// shouldQueueImpl reports whether the given truncate decision should cause
// the replica to be queued. That is the case either when the decision asks
// for a truncation, or when the log size needs to be recomputed (in which
// case both `shouldQ` and `recomputeRaftLogSize` are true and a reasonable
// priority is returned).
func (rlq *raftLogQueue) shouldQueueImpl(
	ctx context.Context, decision truncateDecision,
) (shouldQ bool, recomputeRaftLogSize bool, priority float64) {
	in := &decision.Input

	if decision.ShouldTruncate() {
		// Prioritize by log size, and recompute it if it can't be trusted.
		return true, !in.LogSizeTrusted, float64(in.LogSize)
	}

	if !in.LogSizeTrusted && in.LastIndex != in.FirstIndex {
		// We have a nonempty log (first index != last index) and can't vouch
		// that the bytes in the log are known. Queue the replica; processing
		// it will force a recomputation. For the priority, we have to pick one
		// as we usually use the log size which is not available here. Going
		// half-way between zero and the MaxLogSize should give a good tradeoff
		// between processing the recomputation quickly, and not starving
		// replicas which see a significant amount of write traffic until they
		// run over and truncate more aggressively than they need to.
		return true, true, 1.0 + float64(in.MaxLogSize)/2.0
	}

	// Nothing to truncate, and the size is either trusted or the log is empty.
	return false, false, 0
}
// process computes a truncate decision for the replica and, if the decision
// says so, truncates the Raft log by sending a TruncateLogRequest for the
// decision's NewFirstIndex.
//
// If the tracked Raft log size cannot be trusted, it is first recomputed
// (under raftMu) and the decision is computed anew from the accurate size.
// An error is returned if a decision cannot be computed, if recomputing the
// log size fails, or if the truncation request fails.
func (rlq *raftLogQueue) process(ctx context.Context, r *Replica, _ *config.SystemConfig) error {
	decision, err := newTruncateDecision(ctx, r)
	if err != nil {
		return err
	}

	if _, recompute, _ := rlq.shouldQueueImpl(ctx, decision); recompute {
		log.VEventf(ctx, 2, "recomputing raft log based on decision %+v", decision)

		// We need to hold raftMu both to access the sideloaded storage and to
		// make sure concurrent Raft activity doesn't foul up our update to the
		// cached in-memory values.
		r.raftMu.Lock()
		n, err := ComputeRaftLogSize(ctx, r.RangeID, r.Engine(), r.raftMu.sideloaded)
		if err == nil {
			r.mu.Lock()
			r.mu.raftLogSize = n
			r.mu.raftLogLastCheckSize = n
			r.mu.raftLogSizeTrusted = true
			r.mu.Unlock()
		}
		r.raftMu.Unlock()

		if err != nil {
			return errors.Wrap(err, "recomputing raft log size")
		}

		log.VEventf(ctx, 2, "recomputed raft log size to %s", humanizeutil.IBytes(n))

		// Override the decision, now that an accurate log size is available.
		decision, err = newTruncateDecision(ctx, r)
		if err != nil {
			return err
		}
	}

	// Can and should the raft logs be truncated?
	if !decision.ShouldTruncate() {
		// Pass the rendered decision as an argument, never as the format
		// string, so that it is emitted verbatim.
		log.VEventf(ctx, 3, "%s", decision.String())
		return nil
	}

	// Decisions that imply new Raft snapshots are logged at info level, rate
	// limited by logSnapshots; with verbosity 1 every decision is logged.
	if n := decision.NumNewRaftSnapshots(); log.V(1) || (n > 0 && rlq.logSnapshots.ShouldProcess(timeutil.Now())) {
		log.Info(ctx, decision.String())
	} else {
		log.VEvent(ctx, 1, decision.String())
	}

	b := &client.Batch{}
	b.AddRawRequest(&roachpb.TruncateLogRequest{
		RequestHeader: roachpb.RequestHeader{Key: r.Desc().StartKey.AsRawKey()},
		Index:         decision.NewFirstIndex,
		RangeID:       r.RangeID,
	})
	if err := rlq.db.Run(ctx, b); err != nil {
		return err
	}
	r.store.metrics.RaftLogTruncated.Inc(int64(decision.NumTruncatableIndexes()))
	return nil
}
// timer returns the interval to wait between processing successive queued
// truncations. It is always raftLogQueueTimerDuration, regardless of the
// duration passed in.
func (*raftLogQueue) timer(_ time.Duration) time.Duration {
	const interval time.Duration = raftLogQueueTimerDuration
	return interval
}
// purgatoryChan always returns a nil channel.
func (*raftLogQueue) purgatoryChan() <-chan time.Time {
	var none <-chan time.Time
	return none
}
// Compile-time check that uint64Slice satisfies sort.Interface.
var _ sort.Interface = uint64Slice(nil)

// uint64Slice attaches the methods of sort.Interface to []uint64, sorting in
// ascending order.
type uint64Slice []uint64

// Len returns the number of elements. It implements sort.Interface.
func (s uint64Slice) Len() int {
	return len(s)
}

// Swap exchanges the elements at positions i and j. It implements
// sort.Interface.
func (s uint64Slice) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}

// Less reports whether the element at position i is strictly smaller than
// the one at position j. It implements sort.Interface.
func (s uint64Slice) Less(i, j int) bool {
	return s[j] > s[i]
}