From 6562b3093873f61b6db248a8228e3d82ed292f36 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Fri, 8 Apr 2022 12:10:15 +0200
Subject: [PATCH 1/2] [wip] kvserver: reproduce cluster-wide outage if liveness leaseholder deadlocks

This provides a reproduction of (internal)
https://github.com/cockroachlabs/support/issues/1520.

Usage:

```
./run.sh
pkill -HUP cockroach
```

Release note: None
---
 pkg/kv/kvserver/replica_raft.go | 21 +++++++++++++++++++++
 run.sh                          |  9 +++++++++
 2 files changed, 30 insertions(+)
 create mode 100755 run.sh

diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
index 148734cf2874..0484c2bba134 100644
--- a/pkg/kv/kvserver/replica_raft.go
+++ b/pkg/kv/kvserver/replica_raft.go
@@ -13,8 +13,11 @@ package kvserver
 import (
 	"context"
 	"math/rand"
+	"os"
+	"os/signal"
 	"sort"
 	"strings"
+	"syscall"
 	"time"
 
 	"github.com/cockroachdb/cockroach/pkg/clusterversion"
@@ -560,6 +563,12 @@ func (r *Replica) handleRaftReady(
 	return r.handleRaftReadyRaftMuLocked(ctx, inSnap)
 }
 
+var sigCh = func() chan os.Signal {
+	ch := make(chan os.Signal, 1)
+	signal.Notify(ch, syscall.SIGHUP)
+	return ch
+}()
+
 // handleRaftReadyRaftMuLocked is the same as handleRaftReady but requires that
 // the replica's raftMu be held.
 //
@@ -633,6 +642,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		return stats, "", nil
 	}
 
+	if r.RangeID == 2 {
+		select {
+		case <-sigCh:
+			if l, _ := r.GetLease(); l.OwnedBy(r.store.StoreID()) {
+				log.Warningf(ctx, "deadlocking liveness leaseholder")
+				r.mu.Lock()
+				r.mu.Lock()
+			}
+		default:
+		}
+	}
+
 	logRaftReady(ctx, rd)
 
 	refreshReason := noReason
diff --git a/run.sh b/run.sh
new file mode 100755
index 000000000000..e608e0ba35be
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+roachprod destroy local || true
+roachprod create -n 5 local
+roachprod put local cockroach
+roachprod start local
+tail -F ~/local/*/logs/cockroach.log
+
+

From 08781787e692fab68521525ceae750a5065dcb01 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Fri, 8 Apr 2022 16:14:33 +0200
Subject: [PATCH 2/2] this fixes it

In DistSender.sendToReplicas, short-circuit the `ctx.Err() == nil` guard
with `true ||` so that the cached-lease eviction check also runs when
ctx.Err() is non-nil.

---
 pkg/kv/kvclient/kvcoord/dist_sender.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go
index f6ac71f07829..013dc20c116b 100644
--- a/pkg/kv/kvclient/kvcoord/dist_sender.go
+++ b/pkg/kv/kvclient/kvcoord/dist_sender.go
@@ -2131,7 +2131,7 @@ func (ds *DistSender) sendToReplicas(
 				// replica into the cache, but without a leaseholder (and taking into
 				// account that the local node can't be down) it won't take long until we
 				// talk to a replica that tells us who the leaseholder is.
-				if ctx.Err() == nil {
+				if true || ctx.Err() == nil {
 					if lh := routing.Leaseholder(); lh != nil && *lh == curReplica {
 						routing.EvictLease(ctx)
 					}