From 410061c99aa142f1cf4429e5553a9f78fa5ae4c7 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 8 Mar 2018 14:00:16 -0500 Subject: [PATCH] kv: connect transport lazily A user observed that after decommissioning a node, connections would still be attempted to the node, seemingly forever. This was easily reproduced (gRPC warnings are logged when our onlyOnceDialer returns `grpcutil.ErrCannotReuseClientConn`). The root cause is that when the node disappears, many affected range descriptors are not evicted by `DistSender`: when the decommissioned node was not the lease holder, there will be no reason to evict. Similarly, if it was, we'll opportunistically send an RPC to the other possibly stale, but often not) replicas which frequently discovers the new lease holder. This doesn't seem like a problem; the replica is carried through and usually placed last (behind the lease holder and any other healthy replica) - the only issue was that the transport layer connects to all of the replicas eagerly, which provokes the spew of log messages. This commit changes the initialization process so that we don't actually GRPCDial replicas until they are needed. The previous call site to GRPCDial and the new one are very close to each other, so nothing is lost. In the common case, the rpc context already has a connection open and no actual work is done anyway. I diagnosed this using an in-progress decommissioning roachtest (and printf debugging); I hope to be able to prevent regression of this bug in that test when it's done. Touches #21882. Release note (bug fix): Avoid connection attempts to former members of the cluster and the associated spurious log messages. --- pkg/kv/transport.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/kv/transport.go b/pkg/kv/transport.go index fa0a24b70fec..e3ce4e940da2 100644 --- a/pkg/kv/transport.go +++ b/pkg/kv/transport.go @@ -42,7 +42,6 @@ type SendOptions struct { type batchClient struct { remoteAddr string - conn *rpc.Connection args roachpb.BatchRequest healthy bool pending bool @@ -118,15 +117,14 @@ func grpcTransportFactoryImpl( ) (Transport, error) { clients := make([]batchClient, 0, len(replicas)) for _, replica := range replicas { - conn := rpcContext.GRPCDial(replica.NodeDesc.Address.String()) argsCopy := args argsCopy.Replica = replica.ReplicaDescriptor remoteAddr := replica.NodeDesc.Address.String() + healthy := rpcContext.ConnHealth(remoteAddr) == nil clients = append(clients, batchClient{ remoteAddr: remoteAddr, - conn: conn, args: argsCopy, - healthy: rpcContext.ConnHealth(remoteAddr) == nil, + healthy: healthy, }) } @@ -244,7 +242,7 @@ func (gt *grpcTransport) send( } log.VEventf(ctx, 2, "sending request to %s", client.remoteAddr) - conn, err := client.conn.Connect(ctx) + conn, err := gt.rpcContext.GRPCDial(client.remoteAddr).Connect(ctx) if err != nil { return nil, err }