From df95e7182fd35d7155a22f81ed83edac12ee9e08 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 16 Dec 2022 14:39:48 +0000 Subject: [PATCH] liveness: clarify `IncrementEpoch` race error and log as INFO Epic: none Release note: None --- pkg/kv/kvserver/liveness/liveness.go | 23 ++++++++++++++++++++++- pkg/kv/kvserver/replica_range_lease.go | 9 ++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index a5ceb1da1fd9..38a5eebc878d 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -67,6 +67,24 @@ var ( ErrEpochAlreadyIncremented = errors.New("epoch already incremented") ) +type ErrEpochCondFailed struct { + expected, actual livenesspb.Liveness +} + +// SafeFormatError implements errors.SafeFormatter. +func (e *ErrEpochCondFailed) SafeFormatError(p errors.Printer) error { + p.Printf( + "liveness record changed while incrementing epoch for %+v; actual is %+v; is the node still live?", + redact.Safe(e.expected), redact.Safe(e.actual)) + return nil +} + +func (e *ErrEpochCondFailed) Format(s fmt.State, verb rune) { errors.FormatError(e, s, verb) } + +func (e *ErrEpochCondFailed) Error() string { + return fmt.Sprint(e) +} + type errRetryLiveness struct { error } @@ -1223,7 +1241,10 @@ func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness livenesspb. } else if actual.Epoch < liveness.Epoch { return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch) } - return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual) + return &ErrEpochCondFailed{ + expected: liveness, + actual: actual.Liveness, + } }) if err != nil { return err diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index a256f68cc17d..a9afbd680813 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -511,7 +511,14 @@ func (p *pendingLeaseRequest) requestLease( // so we don't log it as an error. // // https://github.com/cockroachdb/cockroach/issues/35986 - if !errors.Is(err, liveness.ErrEpochAlreadyIncremented) { + if errors.Is(err, liveness.ErrEpochAlreadyIncremented) { + // ignore + } else if errors.HasType(err, &liveness.ErrEpochCondFailed{}) { + // ErrEpochCondFailed indicates that someone else changed the liveness + // record while we were incrementing it. The node could still be + // alive, or someone else updated it. Don't log this as an error. + log.Infof(ctx, "failed to increment leaseholder's epoch: %s", err) + } else { log.Errorf(ctx, "failed to increment leaseholder's epoch: %s", err) } }