From bf108d43e53efe7dd1152c30f7c33919b0ac73e5 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 19 Aug 2021 17:06:35 -0400 Subject: [PATCH] kvclient: make a test more robust TestFollowerReadsWithStaleDescriptor was fragile: it wants to control precisely which follower a particular query will be routed to and it went through some knobs to do so, but one more knob is needed. This patch introduces the knob for inhibiting the GRPCTransport's check of connection health when deciding the replica ordering - it normally tries to deprioritize nodes to which there isn't a healthy conn. A connection starts off as unhealthy until it's heartbeated for the first time so, depending on whether a connection was established or not to the desired node sufficiently before the query of interest, that check is a problem. In particular, the transport's default behavior becomes a problem with the following commits which remove the old closed timestamp mechanism that came with early establishment of some network connections. Without it, this test becomes unhappy. Release note: None --- .../kvccl/kvfollowerreadsccl/followerreads_test.go | 7 +++++++ pkg/kv/kvclient/kvcoord/dist_sender.go | 11 +++++++++-- pkg/kv/kvclient/kvcoord/testing_knobs.go | 6 ++++++ pkg/kv/kvclient/kvcoord/transport.go | 14 +++++++++++--- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go index cf11f95a3503..c87cd330c4a5 100644 --- a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go +++ b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go @@ -593,6 +593,13 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) { UseDatabase: "t", Knobs: base.TestingKnobs{ KVClient: &kvcoord.ClientTestingKnobs{ + // Inhibit the checking of connection health done by the + // GRPCTransport. This test wants to control what replica (which + // follower) a request is sent to and, depending on timing, the + // connection from n4 to the respective follower might not be + // heartbeated by the time the test wants to use it. Without this + // knob, that would cause the transport to reorder replicas. + DontConsiderConnHealth: true, LatencyFunc: func(addr string) (time.Duration, bool) { if (addr == n2Addr.Get()) || (addr == n3Addr.Get()) { return time.Millisecond, true diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go index 0ce83696b4c7..52a4cca11750 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender.go @@ -290,6 +290,11 @@ type DistSender struct { // the descriptor, instead of trying to reorder them by latency. The knob // only applies to requests sent with the LEASEHOLDER routing policy. dontReorderReplicas bool + // dontConsiderConnHealth, if set, makes the GRPCTransport not take into + // consideration the connection health when deciding the ordering for + // replicas. When not set, replicas on nodes with unhealthy connections are + // deprioritized. + dontConsiderConnHealth bool // Currently executing range feeds. activeRangeFeeds sync.Map // // map[*rangeFeedRegistry]nil @@ -383,6 +388,7 @@ func NewDistSender(cfg DistSenderConfig) *DistSender { ds.transportFactory = GRPCTransportFactory } ds.dontReorderReplicas = cfg.TestingKnobs.DontReorderReplicas + ds.dontConsiderConnHealth = cfg.TestingKnobs.DontConsiderConnHealth ds.rpcRetryOptions = base.DefaultRetryOptions() if cfg.RPCRetryOptions != nil { ds.rpcRetryOptions = *cfg.RPCRetryOptions @@ -1855,8 +1861,9 @@ func (ds *DistSender) sendToReplicas( } opts := SendOptions{ - class: rpc.ConnectionClassForKey(desc.RSpan().Key), - metrics: &ds.metrics, + class: rpc.ConnectionClassForKey(desc.RSpan().Key), + metrics: &ds.metrics, + dontConsiderConnHealth: ds.dontConsiderConnHealth, } transport, err := ds.transportFactory(opts, ds.nodeDialer, replicas) if err != nil { diff --git a/pkg/kv/kvclient/kvcoord/testing_knobs.go b/pkg/kv/kvclient/kvcoord/testing_knobs.go index 94b7c9c04647..d29e70408c55 100644 --- a/pkg/kv/kvclient/kvcoord/testing_knobs.go +++ b/pkg/kv/kvclient/kvcoord/testing_knobs.go @@ -19,6 +19,12 @@ type ClientTestingKnobs struct { // testing purposes. TransportFactory TransportFactory + // DontConsiderConnHealth, if set, makes the GRPCTransport not take into + // consideration the connection health when deciding the ordering for + // replicas. When not set, replicas on nodes with unhealthy connections are + // deprioritized. + DontConsiderConnHealth bool + // The maximum number of times a txn will attempt to refresh its // spans for a single transactional batch. // 0 means use a default. -1 means disable refresh. diff --git a/pkg/kv/kvclient/kvcoord/transport.go b/pkg/kv/kvclient/kvcoord/transport.go index cacb061a6397..f14f85ee708f 100644 --- a/pkg/kv/kvclient/kvcoord/transport.go +++ b/pkg/kv/kvclient/kvcoord/transport.go @@ -34,6 +34,11 @@ import ( type SendOptions struct { class rpc.ConnectionClass metrics *DistSenderMetrics + // dontConsiderConnHealth, if set, makes the transport not take into + // consideration the connection health when deciding the ordering for + // replicas. When not set, replicas on nodes with unhealthy connections are + // deprioritized. + dontConsiderConnHealth bool } // TransportFactory encapsulates all interaction with the RPC @@ -109,6 +114,7 @@ func grpcTransportFactoryImpl( } else { replicas = replicas[:len(rs)] } + // We'll map the index of the replica descriptor in its slice to its health. var health util.FastIntMap for i, r := range rs { @@ -121,9 +127,11 @@ func grpcTransportFactoryImpl( } } - // Put known-healthy clients first, while otherwise respecting the existing - // ordering of the replicas. - splitHealthy(replicas, health) + if !opts.dontConsiderConnHealth { + // Put known-healthy clients first, while otherwise respecting the existing + // ordering of the replicas. + splitHealthy(replicas, health) + } *transport = grpcTransport{ opts: opts,