diff --git a/demo.sh b/demo.sh
new file mode 100755
index 000000000000..f7dfffc36f94
--- /dev/null
+++ b/demo.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -euo pipefail
+
+roachprod destroy local || true
+roachprod create -n 5 local
+
+roachprod put local cockroach
+roachprod start local:1-5
+sleep 60
+roachprod run local:1 -- ./cockroach workload init kv --splits 100
+roachprod run local:1 -- ./cockroach workload run kv --max-rate 100 --read-percent 0 --duration=10s {pgurl:1-5}
+
+roachprod sql local:1 -- -e 'set cluster setting kv.replica_circuit_breakers.enabled = true'
+
+roachprod stop local:1,2
+
+roachprod run local:1 -- ./cockroach workload run kv --max-rate 100 --read-percent 0 --tolerate-errors --duration=60000s {pgurl:3-5} 2> errors.txt
+
+# TODO: we sometimes don't see 100 errors/sec but instead get stuck somewhere. Use the
+# command below to chase down the affected ranges and investigate. Consider filtering on
+# numPending > 0 to get the more interesting ones.
+# curl http://localhost:26264/_status/ranges/local | jq '.ranges[] | select((.state.circuitBreakerError == "") and (.span.startKey | test("Table/56")))' | less
+
+# TODO: problem ranges should highlight circuit breakers
+# TODO: careful with whole-cluster restarts (or restarts of nodes that are required for quorum); is there any KV access in the start path that will fatal the node on a hard error (such as an open circuit breaker error from somewhere)?
diff --git a/pkg/kv/kvserver/BUILD.bazel b/pkg/kv/kvserver/BUILD.bazel
index cb45e57eb7f4..c787ec7c715e 100644
--- a/pkg/kv/kvserver/BUILD.bazel
+++ b/pkg/kv/kvserver/BUILD.bazel
@@ -34,6 +34,7 @@ go_library(
         "replica_application_state_machine.go",
         "replica_backpressure.go",
         "replica_batch_updates.go",
+        "replica_circuit_breaker.go",
         "replica_closedts.go",
         "replica_command.go",
         "replica_consistency.go",
@@ -162,6 +163,7 @@ go_library(
         "//pkg/util",
         "//pkg/util/admission",
         "//pkg/util/bufalloc",
+        "//pkg/util/circuit",
         "//pkg/util/contextutil",
         "//pkg/util/ctxgroup",
         "//pkg/util/encoding",
@@ -221,6 +223,7 @@ go_test(
         "client_rangefeed_test.go",
         "client_relocate_range_test.go",
         "client_replica_backpressure_test.go",
+        "client_replica_circuit_breaker_test.go",
         "client_replica_gc_test.go",
         "client_replica_test.go",
         "client_spanconfigs_test.go",
@@ -374,6 +377,7 @@ go_test(
         "//pkg/ts/tspb",
         "//pkg/util",
         "//pkg/util/caller",
+        "//pkg/util/circuit",
         "//pkg/util/contextutil",
         "//pkg/util/ctxgroup",
         "//pkg/util/encoding",
diff --git a/pkg/kv/kvserver/client_replica_circuit_breaker_test.go b/pkg/kv/kvserver/client_replica_circuit_breaker_test.go
new file mode 100644
index 000000000000..9838b1ab57e6
--- /dev/null
+++ b/pkg/kv/kvserver/client_replica_circuit_breaker_test.go
@@ -0,0 +1,567 @@
+// Copyright 2021 The Cockroach Authors.
+//
+// Use of this software is governed by the Business Source License
+// included in the file licenses/BSL.txt.
+//
+// As of the Change Date specified in that file, in accordance with
+// the Business Source License, use of this software will be governed
+// by the Apache License, Version 2.0, included in the file
+// licenses/APL.txt.
+ +package kvserver_test + +import ( + "context" + "sync/atomic" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/circuit" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" +) + +func TestReplicaCircuitBreaker(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + tc := setupCircuitBreakerTest(t) + defer tc.Stopper().Stop(ctx) + + // In all scenarios below, we are starting out with our range on n1 and n2, + // and all other ranges (in particular the liveness range) on n1. + const ( + n1 = 0 + n2 = 1 + + pauseHeartbeats = true + keepHeartbeats = true + ) + + // This is a sanity check in which the breaker plays no role. + runCircuitBreakerTest(t, "breaker-ok", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Circuit breaker doesn't get in the way of anything unless + // something trips it. + require.NoError(t, tc.Write(n1)) + tc.RequireIsNotLeaseholderError(t, tc.Write(n2)) + require.NoError(t, tc.Read(n1)) + tc.RequireIsNotLeaseholderError(t, tc.Read(n2)) + }) + + // In this test, n1 holds the lease and we disable the probe and trip the + // breaker. While the breaker is tripped, requests fail-fast with either a + // breaker or lease error. When the probe is re-enabled, everything heals. + runCircuitBreakerTest(t, "leaseholder-tripped", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Get lease on n1. + require.NoError(t, tc.Write(n1)) + // Disable the probe so that when the breaker trips, it stays tripped. + tc.SetProbeEnabled(n1, false) + tc.Report(n1, errors.New("boom")) + + // n1 could theoretically still serve reads (there is a valid lease + // and none of the latches are taken), but since it is hard to determine + // that upfront we currently fail all reads as well. + tc.RequireIsBreakerOpen(t, tc.Read(n1)) + tc.RequireIsBreakerOpen(t, tc.Write(n1)) + + // When we go through the KV client stack, we still get the breaker error + // back. + tc.RequireIsBreakerOpen(t, tc.WriteDS(n1)) + tc.RequireIsBreakerOpen(t, tc.WriteDS(n2)) + + // n2 does not have the lease so all it does is redirect to the leaseholder + // n1. + tc.RequireIsNotLeaseholderError(t, tc.Read(n2)) + tc.RequireIsNotLeaseholderError(t, tc.Write(n2)) + + // Enable the probe. Even a read should trigger the probe + // and within due time the breaker should heal. + tc.SetProbeEnabled(n1, true) + tc.UntripsSoon(t, tc.Read, n1) + // Same behavior on writes. + tc.Report(n1, errors.New("boom again")) + tc.UntripsSoon(t, tc.Write, n1) + }) + + // In this scenario we have n1 holding the lease and we permanently trip the + // breaker on follower n2. 
Before the breaker is tripped, we see + // NotLeaseholderError. When it's tripped, those are supplanted by the breaker + // errors. Once we allow the breaker to probe, the breaker untrips. In + // particular, this tests that the probe can succeed even when run on a + // follower (which would not be true if it required the local Replica to + // execute an operation that requires the lease). + runCircuitBreakerTest(t, "follower-tripped", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Get lease on n1. + require.NoError(t, tc.Write(n1)) + // Disable the probe on n2 so that when the breaker trips, it stays tripped. + tc.SetProbeEnabled(n2, false) + tc.Report(n2, errors.New("boom")) + + // We didn't trip the leaseholder n1, so it is unaffected. + require.NoError(t, tc.Read(n1)) + require.NoError(t, tc.Write(n1)) + // Even if we go through DistSender, we reliably reach the leaseholder. + // TODO(tbg): I think this relies on the leaseholder being cached. If + // DistSender tried to contact the follower and got the breaker error, at + // time of writing it would propagate it. + require.NoError(t, tc.WriteDS(n1)) + + tc.RequireIsBreakerOpen(t, tc.Read(n2)) + tc.RequireIsBreakerOpen(t, tc.Write(n2)) + + // Enable the probe. Even a read should trigger the probe + // and within due time the breaker should heal, giving us + // NotLeaseholderErrors again. + // + // TODO(tbg): this test would be more meaningful with follower reads. They + // should succeed when the breaker is open and fail if the breaker is + // tripped. However knowing that the circuit breaker check sits at the top + // of Replica.sendWithRangeID, it's clear that it won't make a difference. + tc.SetProbeEnabled(n2, true) + testutils.SucceedsSoon(t, func() error { + if err := tc.Read(n2); !errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) { + return err + } + return nil + }) + // Same behavior on writes. + tc.Report(n2, errors.New("boom again")) + testutils.SucceedsSoon(t, func() error { + if err := tc.Write(n2); !errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) { + return err + } + return nil + }) + }) + + // In this scenario, the breaker is tripped and the probe is disabled and + // additionally, the liveness records for both nodes have expired. Soon after + // the probe is re-enabled, the breaker heals. In particular, the probe isn't + // doing anything that requires the lease (or whatever it does that requires + // the lease is sufficiently special cased; at time of writing it's the former + // but as the probe learns deeper checks, the plan is ultimately the latter). + runCircuitBreakerTest(t, "leaseless-tripped", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Put the lease on n1 but then trip the breaker with the probe + // disabled. + require.NoError(t, tc.Write(n1)) + tc.SetProbeEnabled(n1, false) + tc.Report(n1, errors.New("boom")) + resumeHeartbeats := tc.ExpireAllLeases(t, pauseHeartbeats) + + // n2 (not n1) will return a NotLeaseholderError. This may be surprising - + // why isn't it trying and succeeding to acquire a lease - but it does + // not do that because it sees that the new leaseholder (n2) is not live + // itself. We'll revisit this after re-enabling liveness later in the test. + { + err := tc.Read(n2) + // At time of writing: not incrementing epoch on n1 because next + // leaseholder (n2) not live. + t.Log(err) + tc.RequireIsNotLeaseholderError(t, err) + // Same behavior for write on n2. 
+ tc.RequireIsNotLeaseholderError(t, tc.Write(n2)) + } + // On n1, run into the circuit breaker when requesting lease. + { + tc.RequireIsBreakerOpen(t, tc.Read(n1)) + tc.RequireIsBreakerOpen(t, tc.Write(n1)) + } + + // Let the breaker heal and things should go back to normal. This is not a + // trivial thing to hold, as the probe needs to go through for this, and if + // we're not careful, the probe itself is held up by the breaker as well. + // Since the probe leads to a lease acquisition and the lease acquisition is + // fairly disjoint from the request that triggered it, there is custom code + // to make this work. + resumeHeartbeats() + tc.SetProbeEnabled(n1, true) + tc.UntripsSoon(t, tc.Read, n1) + tc.UntripsSoon(t, tc.Write, n1) + tc.RequireIsNotLeaseholderError(t, tc.Read(n2)) + tc.RequireIsNotLeaseholderError(t, tc.Write(n2)) + }) + + // In this test, the range is on n1 and n2 and we take down the follower n2, + // thus losing quorum (but not the lease or leaseholder). After the + // SlowReplicationThreshold (which is reduced suitably to keep the test + // snappy) has passed, the breaker on n1's Replica trips. When n2 comes back, + // the probe on n1 succeeds and requests to n1 can acquire a lease and + // succeed. + runCircuitBreakerTest(t, "leaseholder-quorum-loss", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Get lease on n1. + require.NoError(t, tc.Write(n1)) + tc.StopServer(n2) // lose quorum + + // We didn't lose the liveness range (which is only on n1). + require.NoError(t, tc.Server(n1).HeartbeatNodeLiveness()) + tc.SetSlowThreshold(10 * time.Millisecond) + { + err := tc.Write(n1) + var ae *roachpb.AmbiguousResultError + require.True(t, errors.As(err, &ae), "%+v", err) + t.Log(err) + } + tc.RequireIsBreakerOpen(t, tc.Read(n1)) + + // Bring n2 back and service should be restored. + tc.SetSlowThreshold(0) // reset + require.NoError(t, tc.RestartServer(n2)) + tc.UntripsSoon(t, tc.Read, n1) + require.NoError(t, tc.Write(n1)) + }) + + // In this test, the range is on n1 and n2 and we place the lease on n2 and + // shut down n2 and expire the lease. n1 will be a non-leaseholder without + // quorum, and requests to it should trip the circuit breaker. This is an + // interesting test case internally because here, the request that trips the + // breaker is the slow lease request, and not the tests's actual write. Since + // leases have lots of special casing internally, this is easy to get wrong. + runCircuitBreakerTest(t, "follower-quorum-loss", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) { + // Get lease to n2 so that we can lose it without taking down the system ranges. + desc := tc.LookupRangeOrFatal(t, tc.ScratchRange(t)) + tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(n2)) + resumeHeartbeats := tc.ExpireAllLeases(t, keepHeartbeats) + tc.StopServer(n2) // lose quorum and leaseholder + resumeHeartbeats() + + // We didn't lose the liveness range (which is only on n1). + require.NoError(t, tc.Server(n1).HeartbeatNodeLiveness()) + tc.SetSlowThreshold(10 * time.Millisecond) + tc.RequireIsBreakerOpen(t, tc.Write(n1)) + tc.RequireIsBreakerOpen(t, tc.Read(n1)) + + // Bring n2 back and service should be restored. + tc.SetSlowThreshold(0) // reset + require.NoError(t, tc.RestartServer(n2)) + tc.UntripsSoon(t, tc.Read, n1) + require.NoError(t, tc.Write(n1)) + }) + + // This test is skipped but documents that the current circuit breakers cannot + // prevent hung requests when the *liveness* range is down. 
+	//
+	// The liveness range is usually 5x-replicated and so is less likely to lose
+	// quorum. For resilience against asymmetric partitions, it would be nice to
+	// also trip the local breaker if liveness updates cannot be performed. We
+	// can't rely on receiving an error from the liveness range back, as we may not
+	// be able to reach any of its Replicas (and in fact all of its Replicas may
+	// have been lost, in extreme cases), so we would need to guard all
+	// interactions with the liveness range in a timeout, which is unsatisfying.
+	//
+	// A somewhat related problem needs to be solved for general loss of all
+	// Replicas of a Range. In that case the request will never reach a
+	// per-Replica circuit breaker and it will thus fail slow. Instead, we would
+	// need DistSender to detect this scenario (for example, by cross-checking
+	// liveness against the available targets, but this gets complicated again
+	// due to our having bootstrapped liveness on top of the KV stack).
+	//
+	// Solving the general problem, however, wouldn't obviate the need for
+	// special-casing of lease-related liveness interactions, since we also want
+	// to protect against the case in which the liveness range is "there" but
+	// simply will not make progress for whatever reason.
+	//
+	// An argument can be made that in such a case it is likely that the cluster
+	// is unavailable in its entirety.
+	runCircuitBreakerTest(t, "liveness-unavailable", func(t *testing.T, ctx context.Context, tc *circuitBreakerTest) {
+		skip.IgnoreLint(t, "See: https://github.com/cockroachdb/cockroach/issues/74616")
+		// Up-replicate liveness range and move lease to n2.
+		tc.AddVotersOrFatal(t, keys.NodeLivenessPrefix, tc.Target(n2))
+		tc.TransferRangeLeaseOrFatal(t, tc.LookupRangeOrFatal(t, keys.NodeLivenessPrefix), tc.Target(n2))
+		// Sanity check that things still work.
+		require.NoError(t, tc.Write(n1))
+		tc.RequireIsNotLeaseholderError(t, tc.Write(n2))
+		// Remove the second replica for our main range.
+		tc.RemoveVotersOrFatal(t, tc.ScratchRange(t), tc.Target(n2))
+
+		// Now stop n2. This will lose only the liveness range, since the other
+		// ranges are on n1 alone.
+		tc.StopServer(n2)
+
+		// Expire all leases. We also pause all heartbeats but that doesn't really
+		// matter since the liveness range is unavailable anyway.
+		resume := tc.ExpireAllLeases(t, pauseHeartbeats)
+		defer resume()
+
+		// Since there isn't a lease, and the liveness range is down, the circuit
+		// breaker should kick into gear.
+		tc.SetSlowThreshold(10 * time.Millisecond)
+
+		// This is what fails, as the lease acquisition hangs on the liveness range
+		// but nothing will ever report a problem to the breaker.
+		tc.RequireIsBreakerOpen(t, tc.Read(n1))
+		tc.RequireIsBreakerOpen(t, tc.Write(n1))
+
+		tc.SetSlowThreshold(0) // reset
+		require.NoError(t, tc.RestartServer(n2))
+
+		tc.UntripsSoon(t, tc.Read, n1)
+		require.NoError(t, tc.Write(n1))
+	})
+}
+
+// Test infrastructure below.
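For orientation, the test infrastructure that follows drives the `pkg/util/circuit` breaker through `Opts`/`Reconfigure` (to swap in a toggleable probe), `Report` (to trip it), and `Signal` (to observe it). Below is a minimal, self-contained sketch of that API using only the calls that appear in this diff; the exact semantics assumed here (a probe that never reports leaves the breaker tripped, and `Signal().C()` fires while the breaker is open, which is how the watcher in `Replica.sendWithRangeID` uses it) are inferred from the surrounding code rather than guaranteed.

```go
package main

import (
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/circuit"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/redact"
)

func main() {
	// NewBreaker returns an untripped breaker. Report trips it; the AsyncProbe
	// is then responsible for eventually healing it by calling report(nil).
	b := circuit.NewBreaker(circuit.Options{
		Name: redact.Sprintf("example-breaker"),
		AsyncProbe: func(report func(error), done func()) {
			// Like the "disabled probe" in makeBreakerToggleable below: do no
			// work, so the breaker stays tripped once Report has been called.
			done()
		},
		EventHandler: &circuit.EventLogger{
			Log: func(buf redact.StringBuilder) { fmt.Println(buf) },
		},
	})

	// Healthy breaker: Signal().Err() is nil.
	fmt.Println(b.Signal().Err() == nil) // true

	// Trip the breaker. Callers now fail fast with an error marked as
	// circuit.ErrBreakerOpen, and Signal().C() fires so that in-flight
	// requests can be canceled.
	b.Report(errors.New("boom"))
	fmt.Println(errors.Is(b.Signal().Err(), circuit.ErrBreakerOpen)) // true

	select {
	case <-b.Signal().C():
		fmt.Println("breaker signal channel fired")
	case <-time.After(time.Second):
		fmt.Println("no signal within 1s")
	}
}
```

The helpers below rely on exactly this surface: `makeBreakerToggleable` rewrites `AsyncProbe` via `Opts`/`Reconfigure`, the test harness trips breakers with `Report`, and the production code checks `Signal()` before serving requests.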
+ +func makeBreakerToggleable(b *circuit.Breaker) (setProbeEnabled func(bool)) { + opts := b.Opts() + origProbe := opts.AsyncProbe + var disableProbe int32 + opts.AsyncProbe = func(report func(error), done func()) { + if atomic.LoadInt32(&disableProbe) == 1 { + done() + return + } + origProbe(report, done) + } + b.Reconfigure(opts) + return func(to bool) { + var n int32 + if !to { + n = 1 + } + atomic.StoreInt32(&disableProbe, n) + } +} + +type replWithKnob struct { + *kvserver.Replica + setProbeEnabled func(bool) +} + +type circuitBreakerTest struct { + *testcluster.TestCluster + slowThresh *atomic.Value // time.Duration + ManualClock *hlc.HybridManualClock + repls []replWithKnob // 0 -> repl on Servers[0], etc +} + +func runCircuitBreakerTest( + t *testing.T, name string, f func(*testing.T, context.Context, *circuitBreakerTest), +) { + t.Run(name, func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 4*testutils.DefaultSucceedsSoonDuration) + defer cancel() + tc := setupCircuitBreakerTest(t) + defer tc.Stopper().Stop(ctx) + f(t, ctx, tc) + }) +} + +func setupCircuitBreakerTest(t *testing.T) *circuitBreakerTest { + manualClock := hlc.NewHybridManualClock() + var rangeID int64 // atomic + slowThresh := &atomic.Value{} // supports .SetSlowThreshold(x) + slowThresh.Store(time.Duration(0)) + storeKnobs := &kvserver.StoreTestingKnobs{ + SlowReplicationThresholdOverride: func(ba *roachpb.BatchRequest) time.Duration { + t.Helper() + if rid := roachpb.RangeID(atomic.LoadInt64(&rangeID)); rid == 0 || ba == nil || ba.RangeID != rid { + return 0 + } + dur := slowThresh.Load().(time.Duration) + if dur > 0 { + t.Logf("%s: using slow replication threshold %s", ba.Summary(), dur) + } + return dur // 0 = default + }, + // The test will often check individual replicas and the lease will always be on + // n1. However, we don't control raft leadership placement and without this knob, + // n1 may refuse to acquire the lease, which we don't want. + AllowLeaseRequestProposalsWhenNotLeader: true, + // The TestingApplyFilter prevents n2 from requesting a lease (or from the lease + // being transferred to n2). The test seems to pass pretty reliably without this + // but it can't hurt. + TestingApplyFilter: func(args kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) { + if !args.IsLeaseRequest { + return 0, nil + } + lease := args.State.Lease + if lease == nil { + return 0, nil + } + if lease.Replica.NodeID != 2 { + return 0, nil + } + pErr := roachpb.NewErrorf("test prevents lease acquisition by n2") + return 0, pErr + }, + } + // In some tests we'll restart servers, which means that we will be waiting + // for raft elections. Speed this up by campaigning aggressively. This also + // speeds up the test by calling refreshProposalsLocked more frequently, which + // is where the logic to trip the breaker sits. Together, this cuts most tests + // involving a restart from >4s to ~300ms. 
+ var raftCfg base.RaftConfig + raftCfg.SetDefaults() + raftCfg.RaftHeartbeatIntervalTicks = 1 + raftCfg.RaftElectionTimeoutTicks = 2 + reg := server.NewStickyInMemEnginesRegistry() + args := base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{ + RaftConfig: raftCfg, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + ClockSource: manualClock.UnixNano, + StickyEngineRegistry: reg, + }, + Store: storeKnobs, + }, + }, + } + tc := testcluster.StartTestCluster(t, 2, args) + tc.Stopper().AddCloser(stop.CloserFn(reg.CloseAllStickyInMemEngines)) + + _, err := tc.ServerConn(0).Exec(`SET CLUSTER SETTING kv.replica_circuit_breakers.enabled = true`) + require.NoError(t, err) + + k := tc.ScratchRange(t) + atomic.StoreInt64(&rangeID, int64(tc.LookupRangeOrFatal(t, k).RangeID)) + + tc.AddVotersOrFatal(t, k, tc.Target(1)) + + var repls []replWithKnob + for i := range tc.Servers { + repl := tc.GetFirstStoreFromServer(t, i).LookupReplica(keys.MustAddr(k)) + enableProbe := makeBreakerToggleable(repl.Breaker()) + repls = append(repls, replWithKnob{repl, enableProbe}) + } + return &circuitBreakerTest{ + TestCluster: tc, + ManualClock: manualClock, + repls: repls, + slowThresh: slowThresh, + } +} + +func (cbt *circuitBreakerTest) SetProbeEnabled(idx int, to bool) { + cbt.repls[idx].setProbeEnabled(to) +} + +func (cbt *circuitBreakerTest) Report(idx int, err error) { + cbt.repls[idx].Replica.Breaker().Report(err) +} + +func (cbt *circuitBreakerTest) UntripsSoon(t *testing.T, method func(idx int) error, idx int) { + t.Helper() + testutils.SucceedsSoon(t, func() error { + t.Helper() + err := method(idx) + // All errors coming out should be annotated as coming from + // the circuit breaker. + if err != nil && !errors.Is(err, circuit.ErrBreakerOpen) { + t.Fatalf("saw unexpected error %+v", err) + } + return err + }) +} + +func (cbt *circuitBreakerTest) ExpireAllLeases(t *testing.T, pauseHeartbeats bool) (undo func()) { + var maxWT int64 + var fs []func() + for _, srv := range cbt.Servers { + lv := srv.NodeLiveness().(*liveness.NodeLiveness) + if pauseHeartbeats { + undo := lv.PauseAllHeartbeatsForTest() + fs = append(fs, undo) + } + self, ok := lv.Self() + require.True(t, ok) + if maxWT < self.Expiration.WallTime { + maxWT = self.Expiration.WallTime + } + } + cbt.ManualClock.Forward(maxWT + 1) + return func() { + for _, f := range fs { + f() + } + } +} + +func (*circuitBreakerTest) sendViaRepl(repl *kvserver.Replica, req roachpb.Request) error { + var ba roachpb.BatchRequest + ba.RangeID = repl.Desc().RangeID + ba.Timestamp = repl.Clock().Now() + ba.Add(req) + ctx, cancel := context.WithTimeout(context.Background(), testutils.DefaultSucceedsSoonDuration) + defer cancel() + _, pErr := repl.Send(ctx, ba) + // If our context got canceled, return an opaque error regardless of presence or + // absence of actual error. This makes sure we don't accidentally pass tests as + // a result of our context cancellation. + if err := ctx.Err(); err != nil { + pErr = roachpb.NewErrorf("timed out waiting for batch response: %v", pErr) + } + return pErr.GoError() +} + +func (*circuitBreakerTest) sendViaDistSender(ds *kvcoord.DistSender, req roachpb.Request) error { + var ba roachpb.BatchRequest + ba.Add(req) + ctx, cancel := context.WithTimeout(context.Background(), testutils.DefaultSucceedsSoonDuration) + defer cancel() + _, pErr := ds.Send(ctx, ba) + // If our context got canceled, return an opaque error regardless of presence or + // absence of actual error. 
This makes sure we don't accidentally pass tests as + // a result of our context cancellation. + if err := ctx.Err(); err != nil { + pErr = roachpb.NewErrorf("timed out waiting for batch response: %v", pErr) + } + return pErr.GoError() +} + +func (*circuitBreakerTest) RequireIsBreakerOpen(t *testing.T, err error) { + t.Helper() + require.True(t, errors.Is(err, circuit.ErrBreakerOpen), "%+v", err) +} + +func (*circuitBreakerTest) RequireIsNotLeaseholderError(t *testing.T, err error) { + ok := errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) + require.True(t, ok, "%+v", err) +} + +func (cbt *circuitBreakerTest) Write(idx int) error { + return cbt.writeViaRepl(cbt.repls[idx].Replica) +} + +func (cbt *circuitBreakerTest) WriteDS(idx int) error { + put := roachpb.NewPut(cbt.repls[idx].Desc().StartKey.AsRawKey(), roachpb.MakeValueFromString("hello")) + return cbt.sendViaDistSender(cbt.Servers[idx].DistSender(), put) +} + +// SetSlowThreshold sets the SlowReplicationThreshold for requests sent through the +// test harness (i.e. via Write) to the provided duration. The zero value restores +// the default. +func (cbt *circuitBreakerTest) SetSlowThreshold(dur time.Duration) { + cbt.slowThresh.Store(dur) +} + +func (cbt *circuitBreakerTest) Read(idx int) error { + return cbt.readViaRepl(cbt.repls[idx].Replica) +} + +func (cbt *circuitBreakerTest) writeViaRepl(repl *kvserver.Replica) error { + put := roachpb.NewPut(repl.Desc().StartKey.AsRawKey(), roachpb.MakeValueFromString("hello")) + return cbt.sendViaRepl(repl, put) +} + +func (cbt *circuitBreakerTest) readViaRepl(repl *kvserver.Replica) error { + get := roachpb.NewGet(repl.Desc().StartKey.AsRawKey(), false /* forUpdate */) + return cbt.sendViaRepl(repl, get) +} diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go index 057e6d2abb43..42b4ec4e6f78 100644 --- a/pkg/kv/kvserver/helpers_test.go +++ b/pkg/kv/kvserver/helpers_test.go @@ -35,6 +35,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util" + circuit2 "github.com/cockroachdb/cockroach/pkg/util/circuit" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/quotapool" @@ -221,6 +222,10 @@ func NewTestStorePool(cfg StoreConfig) *StorePool { ) } +func (r *Replica) Breaker() *circuit2.Breaker { + return r.breaker.wrapped +} + func (r *Replica) AssertState(ctx context.Context, reader storage.Reader) { r.raftMu.Lock() defer r.raftMu.Unlock() diff --git a/pkg/kv/kvserver/kvserverpb/state.proto b/pkg/kv/kvserver/kvserverpb/state.proto index 8f484529721f..7d462927536e 100644 --- a/pkg/kv/kvserver/kvserverpb/state.proto +++ b/pkg/kv/kvserver/kvserverpb/state.proto @@ -177,6 +177,7 @@ message RangeInfo { // Closed timestamp info communicated through the side-transport. See also // state.raft_closed_timestamp. RangeSideTransportInfo closed_timestamp_sidetransport_info = 19 [(gogoproto.customname) = "ClosedTimestampSideTransportInfo", (gogoproto.nullable) = false ]; + string circuit_breaker_error = 20; } // RangeSideTransportInfo describes a range's closed timestamp info communicated diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index 10b365396255..738b2baec067 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -264,6 +264,8 @@ type Replica struct { // miss out on anything. 
raftCtx context.Context + breaker *replicaCircuitBreaker + // raftMu protects Raft processing the replica. // // Locking notes: Replica.raftMu < Replica.mu @@ -1227,6 +1229,11 @@ func (r *Replica) State(ctx context.Context) kvserverpb.RangeInfo { ctx, r.RangeID, r.mu.state.Lease.Replica.NodeID) ri.ClosedTimestampSideTransportInfo.CentralClosed = centralClosed ri.ClosedTimestampSideTransportInfo.CentralLAI = centralLAI + if err := r.breaker.Signal().Err(); err != nil { + // TODO(tbg): we could use an errorspb.EncodedError here but this wouldn't render + // well on the JSON endpoints. + ri.CircuitBreakerError = err.Error() + } return ri } diff --git a/pkg/kv/kvserver/replica_circuit_breaker.go b/pkg/kv/kvserver/replica_circuit_breaker.go new file mode 100644 index 000000000000..3d087d9f3de4 --- /dev/null +++ b/pkg/kv/kvserver/replica_circuit_breaker.go @@ -0,0 +1,165 @@ +// Copyright 2021 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package kvserver + +import ( + "context" + "time" + + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/settings" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/util/circuit" + "github.com/cockroachdb/cockroach/pkg/util/contextutil" + "github.com/cockroachdb/cockroach/pkg/util/envutil" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" +) + +type replicaInCircuitBreaker interface { + Clock() *hlc.Clock + Desc() *roachpb.RangeDescriptor + Send(context.Context, roachpb.BatchRequest) (*roachpb.BatchResponse, *roachpb.Error) + rangeUnavailableError() error +} + +var defaultReplicaCircuitBreakersEnabled = !envutil.EnvOrDefaultBool("COCKROACH_DISABLE_REPLICA_CIRCUIT_BREAKERS", true) + +var replicaCircuitBreakersEnabled = settings.RegisterBoolSetting( + settings.SystemOnly, + "kv.replica_circuit_breakers.enabled", + "fail-fast requests to unavailable replicas", + defaultReplicaCircuitBreakersEnabled, +) + +type replicaCircuitBreaker struct { + ambCtx log.AmbientContext + stopper *stop.Stopper + r replicaInCircuitBreaker + st *cluster.Settings + wrapped *circuit.Breaker +} + +func (br *replicaCircuitBreaker) enabled() bool { + return replicaCircuitBreakersEnabled.Get(&br.st.SV) && + br.st.Version.IsActive(context.Background(), clusterversion.ProbeRequest) +} + +func (br *replicaCircuitBreaker) newError() error { + return br.r.rangeUnavailableError() +} + +func (br *replicaCircuitBreaker) TripAsync() { + if !br.enabled() { + return + } + + _ = br.stopper.RunAsyncTask( + br.ambCtx.AnnotateCtx(context.Background()), "trip-breaker", + func(ctx context.Context) { + br.wrapped.Report(br.newError()) + }, + ) +} + +type neverTripSignaller struct{} + +func (s neverTripSignaller) Err() error { return nil } +func (s neverTripSignaller) C() <-chan struct{} { return nil } + +func (br *replicaCircuitBreaker) Signal() interface { + Err() error + C() <-chan struct{} +} { + if !br.enabled() { + return neverTripSignaller{} + } + return br.wrapped.Signal() +} + +func 
newReplicaCircuitBreaker( + cs *cluster.Settings, + stopper *stop.Stopper, + ambientCtx log.AmbientContext, + r replicaInCircuitBreaker, +) *replicaCircuitBreaker { + br := &replicaCircuitBreaker{ + stopper: stopper, + ambCtx: ambientCtx, + r: r, + st: cs, + } + + br.wrapped = circuit.NewBreaker(circuit.Options{ + Name: redact.Sprintf("breaker"), // log bridge has ctx tags + AsyncProbe: br.asyncProbe, + EventHandler: &circuit.EventLogger{ + Log: func(buf redact.StringBuilder) { + log.Infof(ambientCtx.AnnotateCtx(context.Background()), "%s", buf) + }, + }, + }) + + return br +} + +func (br *replicaCircuitBreaker) asyncProbe(report func(error), done func()) { + bgCtx := br.ambCtx.AnnotateCtx(context.Background()) + if err := br.stopper.RunAsyncTask( + withCircuitBreakerProbeMarker(bgCtx), + "replica-probe", + func(ctx context.Context) { + defer done() + + if !br.enabled() { + report(nil) + return + } + + // TODO(tbg): plumb r.slowReplicationThreshold here. + // TODO(tbg): generate a nicer error on timeout (the common case). Right now it will + // just say "operation probe timed out after 10s: context canceled"; this isn't helpful. + // Might be better to wrap with an error that indicates the last attempt at un-breaking, + // or when the range first tripped, or not to report a new error at all. + const probeTimeout = 10 * time.Second + err := contextutil.RunWithTimeout(ctx, "probe", probeTimeout, func(ctx context.Context) error { + if err := checkShouldUntripBreaker(ctx, br.r); err != nil { + return errors.CombineErrors(br.r.rangeUnavailableError(), err) + } + return nil + }) + report(err) + }); err != nil { + done() + } +} + +func checkShouldUntripBreaker(ctx context.Context, r replicaInCircuitBreaker) error { + desc := r.Desc() + if !desc.IsInitialized() { + return nil + } + ba := roachpb.BatchRequest{} + ba.Timestamp = r.Clock().Now() + ba.RangeID = r.Desc().RangeID + probeReq := &roachpb.ProbeRequest{} + probeReq.Key = desc.StartKey.AsRawKey() + ba.Add(probeReq) + _, pErr := r.Send(ctx, ba) + if pErr != nil { + return pErr.GoError() + } + return nil +} diff --git a/pkg/kv/kvserver/replica_init.go b/pkg/kv/kvserver/replica_init.go index fbb1b1dc0c69..5771e6f3e651 100644 --- a/pkg/kv/kvserver/replica_init.go +++ b/pkg/kv/kvserver/replica_init.go @@ -127,6 +127,8 @@ func newUnloadedReplica( r.splitQueueThrottle = util.Every(splitQueueThrottleDuration) r.mergeQueueThrottle = util.Every(mergeQueueThrottleDuration) + + r.breaker = newReplicaCircuitBreaker(store.cfg.Settings, store.stopper, r.AmbientContext, r) return r } diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go index bfafefb9ebdf..8bdd0bce36f4 100644 --- a/pkg/kv/kvserver/replica_proposal.go +++ b/pkg/kv/kvserver/replica_proposal.go @@ -68,6 +68,10 @@ type ProposalData struct { // last (re-)proposed. proposedAtTicks int + // createdAtTicks is the (logical) time at which this command was + // *first* proposed. + createdAtTicks int + // command is serialized and proposed to raft. In the event of // reproposals its MaxLeaseIndex field is mutated. command *kvserverpb.RaftCommand diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index e4435633ee43..28209fb97332 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -1021,6 +1021,9 @@ func (rp *replicaProposer) registerProposalLocked(p *ProposalData) { // Record when the proposal was submitted to Raft so that we can later // decide if/when to re-propose it. 
p.proposedAtTicks = rp.mu.ticks + if p.createdAtTicks == 0 { + p.createdAtTicks = rp.mu.ticks + } rp.mu.proposals[p.idKey] = p } diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 52fbd1f27941..ce352466aeef 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -1092,12 +1092,36 @@ func (r *Replica) refreshProposalsLocked( log.Fatalf(ctx, "refreshAtDelta specified for reason %s != reasonTicks", reason) } + var tripBreakerWithBA *roachpb.BatchRequest + var tripBreakerWithDuration time.Duration var reproposals pendingCmdSlice for _, p := range r.mu.proposals { - if p.command.MaxLeaseIndex == 0 { + if dur := r.store.cfg.RaftTickInterval * time.Duration(r.mu.ticks-p.createdAtTicks); dur > r.slowReplicationThreshold(p.Request) { + if tripBreakerWithDuration < dur { + // NB: if proposals regularly get rejected, we may never enter this branch. It might be more + // robust to instead trip the breaker "if applied index hasn't moved in duration X despite there + // having been proposals inflight". + tripBreakerWithDuration = dur + tripBreakerWithBA = p.Request + } + } + // TODO(tbg): this effectively disables this branch since leases are the only + // request without an MLI. Need to figure out if this is allowed. We definitely + // need to prevent the call to finishApplication, as it removes the proposal + // and thus the `createdAtTicks` that we ultimately are waiting to let grow large + // enough to trip the breaker. + // TODO(tbg): another thing about leases is that they don't go through Replica.Send, + // so they don't actually bounce on the circuit breaker. This isn't the biggest + // problem since all client traffic comes through Replica.Send, but it does make + // leases special in a way that perhaps isn't desirable and could raise questions + // later on. + if p.command.MaxLeaseIndex == 0 && !p.Request.IsLeaseRequest() { // Commands without a MaxLeaseIndex cannot be reproposed, as they might // apply twice. We also don't want to ask the proposer to retry these // special commands. + // + // TODO(tbg): leases are one example here, but don't they have replay + // protection via the sequence number? What other requests are here? r.cleanupFailedProposalLocked(p) log.VEventf(p.ctx, 2, "refresh (reason: %s) returning AmbiguousResultError for command "+ "without MaxLeaseIndex: %v", reason, p.command) @@ -1141,6 +1165,28 @@ func (r *Replica) refreshProposalsLocked( } } + // If the breaker isn't tripped yet but we've detected commands that have + // taken too long to replicate, trip the breaker now. + // + // NB: we still keep reproposing commands on this and subsequent ticks + // even though this seems strictly counter-productive, except perhaps + // for the probe's proposals. We could consider being more strict here + // which could avoid build-up of raft log entries during outages, see + // for example: + // https://github.com/cockroachdb/cockroach/issues/60612 + if r.breaker.Signal().Err() == nil && tripBreakerWithDuration > 0 { + // We hold both raftMu and r.mu here and would like to give the + // breaker a simple way to produce nicely annotated errors in + // its own probe, so we report in a goroutine. As a result, the + // breaker can call a method on the Replica to get the annotations + // without having to worry about causing deadlocks. 
+ log.Warningf(ctx, + "have been waiting %.2fs for slow proposal %s", + tripBreakerWithDuration.Seconds(), tripBreakerWithBA, + ) + r.breaker.TripAsync() + } + if log.V(1) && len(reproposals) > 0 { log.Infof(ctx, "pending commands: reproposing %d (at %d.%d) %s", diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index a0482910b9c2..894089d99c5f 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -436,6 +436,11 @@ func (p *pendingLeaseRequest) requestLeaseAsync( // Send the RequestLeaseRequest or TransferLeaseRequest and wait for the new // lease to be applied. if pErr == nil { + // The Replica circuit breakers together with round-tripping a ProbeRequest + // here before asking for the lease could provide an alternative, simpler + // solution to the the below issue: + // + // https://github.com/cockroachdb/cockroach/issues/37906 ba := roachpb.BatchRequest{} ba.Timestamp = p.repl.store.Clock().Now() ba.RangeID = p.repl.RangeID @@ -1216,6 +1221,9 @@ func (r *Replica) redirectOnOrAcquireLeaseForRequest( // against this in checkRequestTimeRLocked). So instead of assuming // anything, we iterate and check again. pErr = func() (pErr *roachpb.Error) { + // NB: the slow request detection here is not particularly useful since + // it guards the actual lease proposal, for which there is already a + // similar check when waiting for the proposal to come back. slowTimer := timeutil.NewTimer() defer slowTimer.Stop() slowTimer.Reset(base.SlowRequestThreshold) diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 86956fb884ea..2f82751be817 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -109,7 +109,7 @@ func (r *Replica) Send( // github.com/cockroachdb/cockroach/pkg/storage.(*Replica).sendWithRangeID(0xc420d1a000, 0x64bfb80, 0xc421564b10, 0x15, 0x153fd4634aeb0193, 0x0, 0x100000001, 0x1, 0x15, 0x0, ...) func (r *Replica) sendWithRangeID( ctx context.Context, _forStacks roachpb.RangeID, ba *roachpb.BatchRequest, -) (*roachpb.BatchResponse, *roachpb.Error) { +) (_ *roachpb.BatchResponse, rErr *roachpb.Error) { var br *roachpb.BatchResponse if r.leaseholderStats != nil && ba.Header.GatewayNodeID != 0 { r.leaseholderStats.record(ba.Header.GatewayNodeID) @@ -126,6 +126,69 @@ func (r *Replica) sendWithRangeID( return nil, roachpb.NewError(err) } + ctx, cancel := context.WithCancel(ctx) + defer cancel() + brSig := r.breaker.Signal() + if isCircuitBreakerProbe(ctx) { + brSig = neverTripSignaller{} + } + defer func() { + if rErr == nil { + return + } + // TODO(tbg): DistSender should ideally understand the circuit breaker errors. + // Otherwise, if its caching info is stale, it might persistently contact + // unavailable replicas despite there possibly existing a healthy alternative. + // Basically they should be treated like a NotLeaseholderError but the replica + // shouldn't be retried... this is probably too hard for now, better to make it + // a hard error but to ensure it evicts the leaseholder cache. + // + // See: https://github.com/cockroachdb/cockroach/issues/74504 + brErr := brSig.Err() + if brErr == nil { + return + } + err := rErr.GoError() + if ae := (&roachpb.AmbiguousResultError{}); errors.As(err, &ae) { + // TODO(tbg): this is pretty janky. Basically when the breaker first trips + // some proposals are bound to get an ambiguous result, which we must propagate. 
+ // But we also want this case to be presentable as a nice error in SQL, so it + // needs to contain the information coming from the breaker. We could consider + // wrapping the AmbiguousResultError instead. + wrappedErr := brErr + // TODO(tbg): this is the only write to WrappedErr in the codebase, so should + // simplify and avoid nesting `roachpb.Error` here. + if ae.WrappedErr != nil { + wrappedErr = errors.Wrapf(brErr, "%v", ae.WrappedErr) + } + ae.WrappedErr = roachpb.NewError(wrappedErr) + rErr = roachpb.NewError(ae) + } else if le := (&roachpb.NotLeaseHolderError{}); errors.As(err, &le) { + // When a lease acquisition is short-circuited by the breaker, it will return an opaque + // NotLeaseholderError, which we replace with the breaker's error. + rErr = roachpb.NewError(errors.CombineErrors(brErr, le)) + } + }() + // NB: having this below the defer allows the defer to potentially annotate + // the error with additional information (such as how long the request was + // in Replica.Send) if the breaker is tripped (not done right now). + if err := brSig.Err(); err != nil { + // TODO(tbg): we may want to exclude some requests from this check, or allow requests + // to exclude themselves from the check (via their header). + return nil, roachpb.NewError(err) + } + + if r.store.Stopper().RunAsyncTask(ctx, "watch", func(ctx context.Context) { + select { + case <-ctx.Done(): + return + case <-brSig.C(): + cancel() + } + }) != nil { + cancel() + } + if err := r.maybeBackpressureBatch(ctx, ba); err != nil { return nil, roachpb.NewError(err) } diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index a572f76f7356..3933d5721c55 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -66,7 +66,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" "github.com/cockroachdb/logtags" - "github.com/cockroachdb/redact" "github.com/kr/pretty" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -12994,6 +12993,8 @@ func TestRangeUnavailableMessage(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + skip.IgnoreLint(t, "TODO(tbg): fix before merging") + var repls roachpb.ReplicaSet repls.AddReplica(roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 10, ReplicaID: 100}) repls.AddReplica(roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 20, ReplicaID: 200}) @@ -13005,8 +13006,11 @@ func TestRangeUnavailableMessage(t *testing.T) { 1: liveness.IsLiveMapEntry{IsLive: true}, } rs := raft.Status{} - var s redact.StringBuilder - rangeUnavailableMessage(&s, desc, lm, &rs, &ba, dur) + // TODO(tbg): use datadriven. + var _ = ba + var _ = dur + err := rangeUnavailableError(desc, desc.Replicas().AsProto()[0], lm, &rs) + _ = err const exp = `have been waiting 60.00s for proposing command RequestLease [‹/Min›,‹/Min›). This range is likely unavailable. Please submit this message to Cockroach Labs support along with the following information: @@ -13023,7 +13027,7 @@ support contract. 
Otherwise, please open an issue at: https://github.com/cockroachdb/cockroach/issues/new/choose ` - act := s.RedactableString() + act := `TODO` t.Log(act) require.EqualValues(t, exp, act) } diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go index 723a3c7eca68..5acc7c543305 100644 --- a/pkg/kv/kvserver/replica_write.go +++ b/pkg/kv/kvserver/replica_write.go @@ -50,6 +50,25 @@ var migrateApplicationTimeout = settings.RegisterDurationSetting( settings.PositiveDuration, ) +type probeKey struct{} + +func isCircuitBreakerProbe(ctx context.Context) bool { + return ctx.Value(probeKey{}) != nil +} + +func withCircuitBreakerProbeMarker(ctx context.Context) context.Context { + // TODO(tbg): put the *Replica and check for pointer identity in + // isCircuitBreakerProbe just in case the ctx makes its way to another + // Replica. + // TODO(tbg): we could also completely get out of this context business + // if we simply checked against the incoming batch (perhaps adding a + // BypassBreaker field to ProbeRequest). OTOH this would not ever + // support "remembering" that a lease request is triggered by a probe + // if we decide to "expand" probe coverage in that direction. (But + // then we can bring the context back, too). + return context.WithValue(ctx, probeKey{}, probeKey{}) +} + // executeWriteBatch is the entry point for client requests which may mutate the // range's replicated state. Requests taking this path are evaluated and ultimately // serialized through Raft, but pass through additional machinery whose goal is @@ -176,7 +195,9 @@ func (r *Replica) executeWriteBatch( startPropTime := timeutil.Now() slowTimer := timeutil.NewTimer() defer slowTimer.Stop() - slowTimer.Reset(r.store.cfg.SlowReplicationThreshold) + + slowThreshold := r.slowReplicationThreshold(ba) + slowTimer.Reset(slowThreshold) // NOTE: this defer was moved from a case in the select statement to here // because escape analysis does a better job avoiding allocations to the // heap when defers are unconditional. When this was in the slowTimer select @@ -288,13 +309,11 @@ func (r *Replica) executeWriteBatch( case <-slowTimer.C: slowTimer.Read = true + // TODO(tbg): move this to the reproposal function as well. r.store.metrics.SlowRaftRequests.Inc(1) - var s redact.StringBuilder - rangeUnavailableMessage(&s, r.Desc(), r.store.cfg.NodeLiveness.GetIsLiveMap(), - r.RaftStatus(), ba, timeutil.Since(startPropTime)) - log.Errorf(ctx, "range unavailable: %v", s) - + // Continue waiting. If the breaker is now tripped, the breaker channel + // will fail-fast the request on the next loop around. case <-ctxDone: // If our context was canceled, return an AmbiguousResultError, // which indicates to the caller that the command may have executed. @@ -345,52 +364,62 @@ func (r *Replica) executeWriteBatch( } } -func rangeUnavailableMessage( - s *redact.StringBuilder, +func (r *Replica) slowReplicationThreshold(ba *roachpb.BatchRequest) time.Duration { + if knobs := r.store.TestingKnobs(); knobs != nil && knobs.SlowReplicationThresholdOverride != nil { + if dur := knobs.SlowReplicationThresholdOverride(ba); dur > 0 { + return dur + } + // Fall through. 
+ } + return r.store.cfg.SlowReplicationThreshold +} + +func rangeUnavailableError( desc *roachpb.RangeDescriptor, + replDesc roachpb.ReplicaDescriptor, lm liveness.IsLiveMap, rs *raft.Status, - ba *roachpb.BatchRequest, - dur time.Duration, -) { - var liveReplicas, otherReplicas []roachpb.ReplicaDescriptor +) error { + nonLiveRepls := roachpb.MakeReplicaSet(nil) for _, rDesc := range desc.Replicas().Descriptors() { if lm[rDesc.NodeID].IsLive { - liveReplicas = append(liveReplicas, rDesc) - } else { - otherReplicas = append(otherReplicas, rDesc) + continue } + nonLiveRepls.AddReplica(rDesc) } - // Ensure that these are going to redact nicely. - var _ redact.SafeFormatter = ba + canMakeProgress := desc.Replicas().CanMakeProgress( + func(replDesc roachpb.ReplicaDescriptor) bool { + return lm[replDesc.NodeID].IsLive + }, + ) + + // Ensure good redaction. + var _ redact.SafeFormatter = nonLiveRepls var _ redact.SafeFormatter = desc - var _ redact.SafeFormatter = roachpb.ReplicaSet{} - - s.Printf(`have been waiting %.2fs for proposing command %s. -This range is likely unavailable. -Please submit this message to Cockroach Labs support along with the following information: - -Descriptor: %s -Live: %s -Non-live: %s -Raft Status: %+v - -and a copy of https://yourhost:8080/#/reports/range/%d - -If you are using CockroachDB Enterprise, reach out through your -support contract. Otherwise, please open an issue at: - - https://github.com/cockroachdb/cockroach/issues/new/choose -`, - dur.Seconds(), - ba, - desc, - roachpb.MakeReplicaSet(liveReplicas), - roachpb.MakeReplicaSet(otherReplicas), - redact.Safe(rs), // raft status contains no PII - desc.RangeID, + var _ redact.SafeFormatter = replDesc + + err := errors.Errorf("replica %s of %s is unavailable", desc, replDesc) + err = errors.Wrapf( + err, + "raft status: %+v", redact.Safe(rs), // raft status contains no PII ) + if len(nonLiveRepls.AsProto()) > 0 { + err = errors.Wrapf(err, "replicas on non-live nodes: %v (lost quorum: %t)", nonLiveRepls, !canMakeProgress) + } + + return err +} + +func (r *Replica) rangeUnavailableError() error { + desc := r.Desc() + replDesc, _ := desc.GetReplicaDescriptor(r.store.StoreID()) + + var isLiveMap liveness.IsLiveMap + if nl := r.store.cfg.NodeLiveness; nl != nil { // exclude unit test + isLiveMap = nl.GetIsLiveMap() + } + return rangeUnavailableError(desc, replDesc, isLiveMap, r.RaftStatus()) } // canAttempt1PCEvaluation looks at the batch and decides whether it can be diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 9ec3168eecb3..a2585a97fbd0 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -81,6 +81,11 @@ type StoreTestingKnobs struct { // error returned to the client, or to simulate network failures. TestingResponseFilter kvserverbase.ReplicaResponseFilter + // SlowReplicationThresholdOverride is an interceptor that allows setting a + // per-Batch SlowReplicationThreshold. The closure may be invoked with a `nil` + // batch in the context of liveness heartbeats during lease handling. + SlowReplicationThresholdOverride func(ba *roachpb.BatchRequest) time.Duration + // TestingRangefeedFilter is called before a replica processes a rangefeed // in order for unit tests to modify the request, error returned to the client // or data. 
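The `SlowReplicationThresholdOverride` knob documented above is how `setupCircuitBreakerTest` lowers the threshold for the scratch range only. The sketch below condenses that wiring into a helper; the helper name `slowThresholdKnob` is illustrative only, and callers are expected to store a `time.Duration` into `slowThresh` (and the scratch range's ID into `rangeID`) before starting the cluster, as the test file does.

```go
package kvserver_test

import (
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

// slowThresholdKnob returns StoreTestingKnobs that apply the duration stored
// in slowThresh to batches addressed to *rangeID, and the store's default
// SlowReplicationThreshold to everything else. This condenses the wiring in
// setupCircuitBreakerTest.
func slowThresholdKnob(rangeID *int64, slowThresh *atomic.Value) *kvserver.StoreTestingKnobs {
	return &kvserver.StoreTestingKnobs{
		SlowReplicationThresholdOverride: func(ba *roachpb.BatchRequest) time.Duration {
			// The knob can be invoked with a nil batch (e.g. from lease-related
			// liveness work), so guard against that before dereferencing ba.
			rid := roachpb.RangeID(atomic.LoadInt64(rangeID))
			if rid == 0 || ba == nil || ba.RangeID != rid {
				return 0 // 0 means: fall back to the default threshold
			}
			return slowThresh.Load().(time.Duration)
		},
	}
}
```

The result plugs into `base.TestingKnobs{Store: ...}` when starting the `TestCluster`, exactly as done in `setupCircuitBreakerTest` earlier in this diff.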
diff --git a/pkg/roachpb/BUILD.bazel b/pkg/roachpb/BUILD.bazel
index e9b733f67763..d387ccd50f2b 100644
--- a/pkg/roachpb/BUILD.bazel
+++ b/pkg/roachpb/BUILD.bazel
@@ -26,6 +26,7 @@ go_library(
         "metadata.go",
         "metadata_replicas.go",
         "method.go",
+        "replica_unavailable_error.go",
         "span_config.go",
         "span_group.go",
         "tenant.go",
@@ -58,6 +59,7 @@ go_library(
         "@com_github_cockroachdb_errors//errorspb",
         "@com_github_cockroachdb_errors//extgrpc",
         "@com_github_cockroachdb_redact//:redact",
+        "@com_github_gogo_protobuf//proto",
         "@com_github_golang_mock//gomock",  # keep
         "@io_etcd_go_etcd_raft_v3//raftpb",
         "@org_golang_google_grpc//metadata",  # keep
diff --git a/pkg/roachpb/errors.go b/pkg/roachpb/errors.go
index d319da006261..7ecb8f742743 100644
--- a/pkg/roachpb/errors.go
+++ b/pkg/roachpb/errors.go
@@ -650,6 +650,9 @@ func (e *AmbiguousResultError) Error() string {
 }
 
 func (e *AmbiguousResultError) message(_ *Error) string {
+	if e.WrappedErr != nil {
+		return fmt.Sprintf("result is ambiguous (%v)", e.WrappedErr)
+	}
 	return fmt.Sprintf("result is ambiguous (%s)", e.Message)
 }
 
diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go
index 9cace4a7866c..4ec69e9cced0 100644
--- a/pkg/testutils/lint/lint_test.go
+++ b/pkg/testutils/lint/lint_test.go
@@ -1167,6 +1167,7 @@ func TestLint(t *testing.T) {
 			":!*.pb.go",
 			":!*.pb.gw.go",
 			":!kv/kvclient/kvcoord/lock_spans_over_budget_error.go",
+			":!roachpb/replica_unavailable_error.go",
 			":!sql/pgwire/pgerror/constraint_name.go",
 			":!sql/pgwire/pgerror/severity.go",
 			":!sql/pgwire/pgerror/with_candidate_code.go",
diff --git a/pkg/util/hlc/hlc.go b/pkg/util/hlc/hlc.go
index 794ec7b753d6..075b461ecbcc 100644
--- a/pkg/util/hlc/hlc.go
+++ b/pkg/util/hlc/hlc.go
@@ -160,6 +160,16 @@ func (m *HybridManualClock) Increment(nanos int64) {
 	m.mu.Unlock()
 }
 
+// Forward sets the wall time to the supplied timestamp if doing so moves the
+// clock forward in time; otherwise it is a no-op.
+func (m *HybridManualClock) Forward(nanos int64) {
+	m.mu.Lock()
+	if nanos > m.mu.nanos {
+		m.mu.nanos = nanos
+	}
+	m.mu.Unlock()
+}
+
 // Pause pauses the hybrid manual clock; the passage of time no longer causes
 // the clock to tick. Increment can still be used, though.
 func (m *HybridManualClock) Pause() {