From 702dd9b2cdf815812bf7654034d5ccc9d64ff844 Mon Sep 17 00:00:00 2001
From: Andrei Matei
Date: Tue, 7 Feb 2023 20:28:27 -0500
Subject: [PATCH 1/3] server: remove stale comment

Release note: None
Epic: None
---
 pkg/server/init.go | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pkg/server/init.go b/pkg/server/init.go
index 958ebc366a53..5665f235bbb5 100644
--- a/pkg/server/init.go
+++ b/pkg/server/init.go
@@ -469,12 +469,6 @@ func (s *initServer) attemptJoinTo(
 	initClient := roachpb.NewInternalClient(conn)
 	resp, err := initClient.Join(ctx, req)
 	if err != nil {
-		// If the target node does not implement the Join RPC, or explicitly
-		// returns errJoinRPCUnsupported (because the cluster version
-		// introducing its usage is not yet active), we error out so the init
-		// server knows to fall back on the gossip-based discovery mechanism for
-		// the clusterID.
-
 		status, ok := grpcstatus.FromError(errors.UnwrapAll(err))
 		if !ok {
 			return nil, err

From 3285d25d7693febdaaf421109a69e5f2ff7dbd25 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Wed, 8 Feb 2023 08:49:55 +0100
Subject: [PATCH 2/3] rpc: improve detection of onlyOnceDialer redials

While looking into #96543, I wasn't 100% sure we weren't accidentally
redialing a connection internally. The improved logging and the new test
make it more obvious that things are working as intended.

Touches #96543.

Epic: none
Release note: None
---
 pkg/rpc/context.go      | 15 ++++++++-------
 pkg/rpc/context_test.go | 24 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/pkg/rpc/context.go b/pkg/rpc/context.go
index 61928d9aebd6..2c24c8413be6 100644
--- a/pkg/rpc/context.go
+++ b/pkg/rpc/context.go
@@ -1798,7 +1798,8 @@ func init() {
 type onlyOnceDialer struct {
 	mu struct {
 		syncutil.Mutex
-		err error
+		err      error
+		redialed bool
 	}
 }
 
@@ -1815,13 +1816,13 @@ func (ood *onlyOnceDialer) dial(ctx context.Context, addr string) (net.Conn, err
 	defer ood.mu.Unlock()
 
 	if err := ood.mu.err; err != nil {
-		// Not first dial.
-		if !errors.Is(err, grpcutil.ErrConnectionInterrupted) {
-			// Hitting this path would indicate that gRPC retried even though it
-			// received an error not marked as retriable. At the time of writing, at
-			// least in simple experiments, as expected we don't see this error.
-			err = errors.Wrap(err, "previous dial failed")
+		if ood.mu.redialed {
+			// We set up onlyOnceDialer to avoid returning any errors that could look
+			// temporary to gRPC, and so we don't expect it to re-dial a connection
+			// twice (the first re-dial is supposed to surface the permanent error).
+			return nil, errors.NewAssertionErrorWithWrappedErrf(err, "gRPC connection unexpectedly re-dialed")
 		}
+		ood.mu.redialed = true
 		return nil, err
 	}
 
diff --git a/pkg/rpc/context_test.go b/pkg/rpc/context_test.go
index 31c645b61113..31a7fec2dcd5 100644
--- a/pkg/rpc/context_test.go
+++ b/pkg/rpc/context_test.go
@@ -2533,3 +2533,27 @@ func TestRejectDialOnQuiesce(t *testing.T) {
 	require.Error(t, err)
 	require.Equal(t, codes.Canceled, status.Code(err))
 }
+
+// TestOnlyOnceDialer verifies that onlyOnceDialer prevents gRPC from re-dialing
+// on an existing connection. (Instead, gRPC will have to make a new dialer).
+func TestOnlyOnceDialer(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	ln, err := net.Listen(util.TestAddr.Network(), util.TestAddr.String())
+	require.NoError(t, err)
+	defer func() { _ = ln.Close() }()
+
+	ood := &onlyOnceDialer{}
+	conn, err := ood.dial(ctx, ln.Addr().String())
+	require.NoError(t, err)
+	_ = conn.Close()
+
+	for i := 0; i < 5; i++ {
+		_, err := ood.dial(ctx, ln.Addr().String())
+		if i == 0 {
+			require.True(t, errors.Is(err, grpcutil.ErrConnectionInterrupted), "i=%d: %+v", i, err)
+		} else {
+			require.ErrorContains(t, err, `gRPC connection unexpectedly re-dialed`)
+		}
+	}
+}

From 0f08f6dfbcb9a2cd1f6cfb6b22f202e36da09fd1 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Wed, 8 Feb 2023 09:07:31 +0100
Subject: [PATCH 3/3] roachtest: verbose gRPC logging for admission-control/tpcc-olap

I ran the test[^1] and it passed, so hopefully this isn't obviously
breaking anything.

See https://github.com/cockroachdb/cockroach/issues/96543.

[^1]: `GCE_PROJECT=andrei-jepsen ./pkg/cmd/roachtest/roachstress.sh -c 1 -u admission-control/tpcc-olap/nodes=3/cpu=8/w=50/c=96 -- tag:weekly`

Epic: none
Release note: None
---
 .../tests/admission_control_tpcc_overload.go | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
index 1aa78f3f9df7..42cdf8b7b845 100644
--- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
+++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
@@ -79,6 +79,28 @@ func (s tpccOLAPSpec) run(ctx context.Context, t test.Test, c cluster.Cluster) {
 		return nil
 	})
 	m.Wait()
+
+	// Before checking liveness, set gRPC logging[^1] to verbose. We stopped the
+	// load, so this should be OK in terms of overhead, and it may help explain
+	// why we see gRPC connections buckle in this workload. See:
+	//
+	// https://github.com/cockroachdb/cockroach/issues/96543
+	//
+	// [^1]: google.golang.org/grpc/grpclog/component.go
+	for i := 1; i <= c.Spec().NodeCount-1; i++ {
+		_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('component=2');`)
+		if err != nil {
+			t.L().PrintfCtx(ctx, "ignoring vmodule error: %s", err)
+		}
+	}
+	defer func() {
+		for i := 1; i <= c.Spec().NodeCount-1; i++ {
+			_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('');`)
+			if err != nil {
+				t.L().PrintfCtx(ctx, "ignoring vmodule-reset error: %s", err)
+			}
+		}
+	}()
 	verifyNodeLiveness(ctx, c, t, duration)
 }
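
A note on how a dialer like onlyOnceDialer plugs into gRPC: grpc.WithContextDialer routes every
transport-level connection attempt for a ClientConn through the supplied function, so a dialer that
refuses anything after its first attempt forces gRPC to surface a permanent error instead of
transparently re-dialing. The sketch below is illustrative only and is not part of the patches
above; onceDialer, errAlreadyDialed, and the localhost:26257 target are made-up names for the
example, not identifiers from the CockroachDB code.

package main

import (
	"context"
	"errors"
	"net"
	"sync"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

// errAlreadyDialed is returned for every dial attempt after the first one.
var errAlreadyDialed = errors.New("onceDialer: connection was already dialed once")

// onceDialer hands out at most one net.Conn; later attempts fail so that gRPC
// gives up on the ClientConn rather than re-dialing behind the caller's back.
type onceDialer struct {
	mu     sync.Mutex
	dialed bool
}

func (d *onceDialer) dial(ctx context.Context, addr string) (net.Conn, error) {
	d.mu.Lock()
	defer d.mu.Unlock()
	if d.dialed {
		return nil, errAlreadyDialed
	}
	d.dialed = true
	var nd net.Dialer
	return nd.DialContext(ctx, "tcp", addr)
}

func main() {
	d := &onceDialer{}
	// grpc.Dial is lazy: the custom dialer only runs once the connection is
	// actually used (or immediately if grpc.WithBlock is supplied).
	conn, err := grpc.Dial(
		"localhost:26257",
		grpc.WithContextDialer(d.dial),
		grpc.WithTransportCredentials(insecure.NewCredentials()),
	)
	if err != nil {
		panic(err)
	}
	defer func() { _ = conn.Close() }()
}

This is the behavior TestOnlyOnceDialer above locks in: once the dialer has reported a permanent
error, the only way to get a fresh connection is to construct a new dialer.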