diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
index 1aa78f3f9df7..42cdf8b7b845 100644
--- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
+++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
@@ -79,6 +79,28 @@ func (s tpccOLAPSpec) run(ctx context.Context, t test.Test, c cluster.Cluster) {
 		return nil
 	})
 	m.Wait()
+
+	// Before checking liveness, set the gRPC logging[^1] to verbose. We stopped the
+	// load so this should be OK in terms of overhead, and it can explain to us why
+	// we see gRPC connections buckle in this workload. See:
+	//
+	// https://github.com/cockroachdb/cockroach/issues/96543
+	//
+	// [^1]: google.golang.org/grpc/grpclog/component.go
+	for i := 1; i <= c.Spec().NodeCount-1; i++ {
+		_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('component=2');`)
+		if err != nil {
+			t.L().PrintfCtx(ctx, "ignoring vmodule error: %s", err)
+		}
+	}
+	defer func() {
+		for i := 1; i <= c.Spec().NodeCount-1; i++ {
+			_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('');`)
+			if err != nil {
+				t.L().PrintfCtx(ctx, "ignoring vmodule-reset error: %s", err)
+			}
+		}
+	}()
 	verifyNodeLiveness(ctx, c, t, duration)
 }
diff --git a/pkg/rpc/context.go b/pkg/rpc/context.go
index b3ddf4368b2f..2426de524182 100644
--- a/pkg/rpc/context.go
+++ b/pkg/rpc/context.go
@@ -1849,7 +1849,8 @@ func init() {
 type onlyOnceDialer struct {
 	mu struct {
 		syncutil.Mutex
-		err error
+		err      error
+		redialed bool
 	}
 }
@@ -1866,13 +1867,13 @@ func (ood *onlyOnceDialer) dial(ctx context.Context, addr string) (net.Conn, err
 	defer ood.mu.Unlock()
 
 	if err := ood.mu.err; err != nil {
-		// Not first dial.
-		if !errors.Is(err, grpcutil.ErrConnectionInterrupted) {
-			// Hitting this path would indicate that gRPC retried even though it
-			// received an error not marked as retriable. At the time of writing, at
-			// least in simple experiments, as expected we don't see this error.
-			err = errors.Wrap(err, "previous dial failed")
+		if ood.mu.redialed {
+			// We set up onlyOnceDialer to avoid returning any errors that could look
+			// temporary to gRPC, and so we don't expect it to re-dial a connection
+			// twice (the first re-dial is supposed to surface the permanent error).
+			return nil, errors.NewAssertionErrorWithWrappedErrf(err, "gRPC connection unexpectedly re-dialed")
 		}
+		ood.mu.redialed = true
 		return nil, err
 	}
diff --git a/pkg/rpc/context_test.go b/pkg/rpc/context_test.go
index 31c645b61113..31a7fec2dcd5 100644
--- a/pkg/rpc/context_test.go
+++ b/pkg/rpc/context_test.go
@@ -2533,3 +2533,27 @@ func TestRejectDialOnQuiesce(t *testing.T) {
 	require.Error(t, err)
 	require.Equal(t, codes.Canceled, status.Code(err))
 }
+
+// TestOnlyOnceDialer verifies that onlyOnceDialer prevents gRPC from re-dialing
+// on an existing connection. (Instead, gRPC will have to make a new dialer.)
+func TestOnlyOnceDialer(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	ln, err := net.Listen(util.TestAddr.Network(), util.TestAddr.String())
+	require.NoError(t, err)
+	defer func() { _ = ln.Close() }()
+
+	ood := &onlyOnceDialer{}
+	conn, err := ood.dial(ctx, ln.Addr().String())
+	require.NoError(t, err)
+	_ = conn.Close()
+
+	for i := 0; i < 5; i++ {
+		_, err := ood.dial(ctx, ln.Addr().String())
+		if i == 0 {
+			require.True(t, errors.Is(err, grpcutil.ErrConnectionInterrupted), "i=%d: %+v", i, err)
+		} else {
+			require.ErrorContains(t, err, `gRPC connection unexpectedly re-dialed`)
+		}
+	}
+}
diff --git a/pkg/server/init.go b/pkg/server/init.go
index 958ebc366a53..5665f235bbb5 100644
--- a/pkg/server/init.go
+++ b/pkg/server/init.go
@@ -469,12 +469,6 @@ func (s *initServer) attemptJoinTo(
 	initClient := roachpb.NewInternalClient(conn)
 	resp, err := initClient.Join(ctx, req)
 	if err != nil {
-		// If the target node does not implement the Join RPC, or explicitly
-		// returns errJoinRPCUnsupported (because the cluster version
-		// introducing its usage is not yet active), we error out so the init
-		// server knows to fall back on the gossip-based discovery mechanism for
-		// the clusterID.
-
 		status, ok := grpcstatus.FromError(errors.UnwrapAll(err))
 		if !ok {
 			return nil, err
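Side note on mechanics, for reviewers less familiar with the dialer plumbing: onlyOnceDialer.dial has the func(context.Context, string) (net.Conn, error) shape that gRPC accepts through grpc.WithContextDialer, which is how a custom dialer ends up on gRPC's (re-)connection path at all. The standalone sketch below is illustrative only and is not CockroachDB code; singleUseDialer, errInterrupted, and the localhost target are hypothetical stand-ins for onlyOnceDialer, grpcutil.ErrConnectionInterrupted, and a real node address.

package main

import (
	"context"
	"errors"
	"fmt"
	"net"
	"sync"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

// errInterrupted stands in for grpcutil.ErrConnectionInterrupted: the error a
// single-use dialer hands back on the first re-dial after the one allowed dial.
var errInterrupted = errors.New("connection interrupted")

// singleUseDialer mimics the onlyOnceDialer pattern: the first dial succeeds,
// every later dial returns an error so that gRPC gives up on the *ClientConn
// instead of silently reconnecting behind the caller's back.
type singleUseDialer struct {
	mu     sync.Mutex
	dialed bool
}

func (d *singleUseDialer) dial(ctx context.Context, target string) (net.Conn, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.dialed {
		return nil, errInterrupted
	}
	d.dialed = true
	var dialer net.Dialer
	return dialer.DialContext(ctx, "tcp", target)
}

func main() {
	d := &singleUseDialer{}
	// WithContextDialer is the gRPC hook through which a custom dialer like this
	// one is invoked whenever gRPC establishes (or re-establishes) the transport.
	conn, err := grpc.Dial("localhost:26257",
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithContextDialer(d.dial),
	)
	if err != nil {
		fmt.Println("dial error:", err)
		return
	}
	defer func() { _ = conn.Close() }()
	fmt.Println("dialed once; a lost transport now surfaces as a permanent error")
}

The design point mirrors the patch: because every dial after the first fails with an error gRPC treats as permanent, the owner of the connection has to build a fresh dialer (and redo the associated bookkeeping) rather than letting gRPC reconnect silently, and the new redialed flag asserts that gRPC never asks a third time.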