Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[wip] kvserver: reproduce cluster-wide outage if liveness leaseholder deadlocks #79648

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/kv/kvclient/kvcoord/dist_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -2131,7 +2131,7 @@ func (ds *DistSender) sendToReplicas(
// replica into the cache, but without a leaseholder (and taking into
// account that the local node can't be down) it won't take long until we
// talk to a replica that tells us who the leaseholder is.
if ctx.Err() == nil {
if true || ctx.Err() == nil {
if lh := routing.Leaseholder(); lh != nil && *lh == curReplica {
routing.EvictLease(ctx)
}
Expand Down
21 changes: 21 additions & 0 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@ package kvserver
import (
"context"
"math/rand"
"os"
"os/signal"
"sort"
"strings"
"syscall"
"time"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
Expand Down Expand Up @@ -560,6 +563,12 @@ func (r *Replica) handleRaftReady(
return r.handleRaftReadyRaftMuLocked(ctx, inSnap)
}

// sigCh is created once, at package initialization, and registered with
// signal.Notify for SIGHUP. It has a buffer of one because the signal package
// never blocks when delivering: with room for a single pending notification,
// a SIGHUP that arrives while nobody is receiving is kept rather than dropped.
var sigCh = func() (hup chan os.Signal) {
	hup = make(chan os.Signal, 1)
	signal.Notify(hup, syscall.SIGHUP)
	return hup
}()

// handleRaftReadyRaftMuLocked is the same as handleRaftReady but requires that
// the replica's raftMu be held.
//
Expand Down Expand Up @@ -633,6 +642,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
return stats, "", nil
}

if r.RangeID == 2 {
select {
case <-sigCh:
if l, _ := r.GetLease(); l.OwnedBy(r.store.StoreID()) {
log.Warningf(ctx, "deadlocking liveness leaseholder")
r.mu.Lock()
r.mu.Lock()
}
default:
}
}

logRaftReady(ctx, rd)

refreshReason := noReason
Expand Down
9 changes: 9 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Rebuilds the roachprod cluster named "local" from scratch (created with
# `-n 5`), uploads the `cockroach` file to it, starts it, and then follows the
# cockroach logs under ~/local.

# Stop at the first failing step so we never start or tail a cluster whose
# creation or upload did not succeed.
set -e

# Best-effort teardown of a previous run; a failure here (for example, when
# there is nothing to destroy) is deliberately ignored.
roachprod destroy local || true
roachprod create -n 5 local
roachprod put local cockroach
roachprod start local
# -F follows by name and keeps retrying, so log files that are rotated or
# re-created continue to be followed.
tail -F ~/local/*/logs/cockroach.log