From 57eed26cb34d2d31bd6d128495fa693b4acaa792 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Fri, 23 Apr 2021 18:20:22 -0400 Subject: [PATCH] kvserver: include Raft log in assertion This patch improves the closed timestamp regression assertion we've seen fire in #61981 to include a tail of the Raft log. Hopefully we never see that assertion fire again, but still I'd like to introduce a precedent for easily printing the log programatically. Also, the assertion now tells people about COCKROACH_RAFT_CLOSEDTS_ASSERTIONS_ENABLED. If the assertion fires and crashes nodes, those nodes will continue crashing on restart as they try to apply the same entries over and over. Release note: None --- .../replica_application_state_machine.go | 16 ++++- pkg/kv/kvserver/replica_raft.go | 64 +++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/replica_application_state_machine.go b/pkg/kv/kvserver/replica_application_state_machine.go index 6d363b25e56e..233656aecb63 100644 --- a/pkg/kv/kvserver/replica_application_state_machine.go +++ b/pkg/kv/kvserver/replica_application_state_machine.go @@ -1083,11 +1083,23 @@ func (b *replicaAppBatch) assertNoCmdClosedTimestampRegression(cmd *replicatedCm prevReq.SafeString("") } + logTail, err := b.r.printRaftTail(cmd.ctx, 100 /* maxEntries */, 2000 /* maxCharsPerEntry */) + if err != nil { + if logTail != "" { + logTail = logTail + "\n; error printing log: " + err.Error() + } else { + logTail = "error printing log: " + err.Error() + } + } + return errors.AssertionFailedf( "raft closed timestamp regression in cmd: %x (term: %d, index: %d); batch state: %s, command: %s, lease: %s, req: %s, applying at LAI: %d.\n"+ - "Closed timestamp was set by req: %s under lease: %s; applied at LAI: %d. Batch idx: %d.", + "Closed timestamp was set by req: %s under lease: %s; applied at LAI: %d. Batch idx: %d.\n"+ + "This assertion will fire again on restart; to ignore run with env var COCKROACH_RAFT_CLOSEDTS_ASSERTIONS_ENABLED=true"+ + "Raft log tail:\n%s", cmd.idKey, cmd.ent.Term, cmd.ent.Index, existingClosed, newClosed, b.state.Lease, req, cmd.leaseIndex, - prevReq, b.closedTimestampSetter.lease, b.closedTimestampSetter.leaseIdx, b.entries) + prevReq, b.closedTimestampSetter.lease, b.closedTimestampSetter.leaseIdx, b.entries, + logTail) } return nil } diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 4f8ee3e79066..80edc046788b 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -15,6 +15,7 @@ import ( "fmt" "math/rand" "sort" + "strings" "time" "github.com/cockroachdb/cockroach/pkg/keys" @@ -1866,3 +1867,66 @@ func getNonDeterministicFailureExplanation(err error) string { } return "???" } + +// printRaftTail pretty-prints the tail of the log and returns it as a string, +// with the same format as `cockroach debug raft-log`. The entries are printed +// from newest to oldest. maxEntries and maxCharsPerEntry control the size of +// the output. +// +// If an error is returned, it's possible that a string with some entries is +// still returned. +func (r *Replica) printRaftTail( + ctx context.Context, maxEntries, maxCharsPerEntry int, +) (string, error) { + start := keys.RaftLogPrefix(r.RangeID) + end := keys.RaftLogPrefix(r.RangeID).PrefixEnd() + + // NB: raft log does not have intents. + it := r.Engine().NewEngineIterator(storage.IterOptions{LowerBound: start, UpperBound: end}) + valid, err := it.SeekEngineKeyLT(storage.EngineKey{Key: end}) + if err != nil { + return "", err + } + if !valid { + return "", errors.AssertionFailedf("iterator invalid but no error") + } + + var sb strings.Builder + for i := 0; i < maxEntries; i++ { + key, err := it.EngineKey() + if err != nil { + return sb.String(), err + } + mvccKey, err := key.ToMVCCKey() + if err != nil { + return sb.String(), err + } + kv := storage.MVCCKeyValue{ + Key: mvccKey, + Value: it.Value(), + } + sb.WriteString(truncateEntryString(SprintKeyValue(kv, true /* printKey */), 2000)) + sb.WriteRune('\n') + + valid, err := it.PrevEngineKey() + if err != nil { + return sb.String(), err + } + if !valid { + // We've finished the log. + break + } + } + return sb.String(), nil +} + +func truncateEntryString(s string, maxChars int) string { + res := s + if len(s) > maxChars { + if maxChars > 3 { + maxChars -= 3 + } + res = s[0:maxChars] + "..." + } + return res +}