Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
96768: server: remove stale comment r=andreimatei a=andreimatei

Release note: None
Epic: None

96781: rpc: improve detection of onlyOnceDialer redials r=erikgrinaker a=tbg

While looking into #96543, I wasn't 100% sure we weren't accidentally
redialing a connection internally. This improved logging and the test
makes it more obvious that things are working as intended.

Touches #96543.

Epic: none
Release note: None


96782: roachtest: verbose gRPC logging for admission-control/tpcc-olap r=irfansharif a=tbg

Closes #96543.

Next time we'll know more.

Epic: none
Release note: None


Co-authored-by: Andrei Matei <[email protected]>
Co-authored-by: Tobias Grieger <[email protected]>
  • Loading branch information
3 people committed Feb 8, 2023
4 parents ec38fb4 + 702dd9b + 3285d25 + 0f08f6d commit 72e3b1f
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 13 deletions.
22 changes: 22 additions & 0 deletions pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,28 @@ func (s tpccOLAPSpec) run(ctx context.Context, t test.Test, c cluster.Cluster) {
return nil
})
m.Wait()

// Before checking liveness, set the gRPC logging[^1] to verbose. We stopped the
// load so this should be OK in terms of overhead, and it can explain to us why
// we see gRPC connections buckle in this workload. See:
//
// https://github.com/cockroachdb/cockroach/issues/96543
//
// [^1]: google.golang.org/grpc/grpclog/component.go
for i := 1; i <= c.Spec().NodeCount-1; i++ {
_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('component=2');`)
if err != nil {
t.L().PrintfCtx(ctx, "ignoring vmodule error: %s", err)
}
}
defer func() {
for i := 1; i <= c.Spec().NodeCount-1; i++ {
_, err := c.Conn(ctx, t.L(), i).ExecContext(ctx, `SELECT crdb_internal.set_vmodule('');`)
if err != nil {
t.L().PrintfCtx(ctx, "ignoring vmodule-reset error: %s", err)
}
}
}()
verifyNodeLiveness(ctx, c, t, duration)
}

Expand Down
15 changes: 8 additions & 7 deletions pkg/rpc/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -1849,7 +1849,8 @@ func init() {
type onlyOnceDialer struct {
mu struct {
syncutil.Mutex
err error
err error
redialed bool
}
}

Expand All @@ -1866,13 +1867,13 @@ func (ood *onlyOnceDialer) dial(ctx context.Context, addr string) (net.Conn, err
defer ood.mu.Unlock()

if err := ood.mu.err; err != nil {
// Not first dial.
if !errors.Is(err, grpcutil.ErrConnectionInterrupted) {
// Hitting this path would indicate that gRPC retried even though it
// received an error not marked as retriable. At the time of writing, at
// least in simple experiments, as expected we don't see this error.
err = errors.Wrap(err, "previous dial failed")
if ood.mu.redialed {
// We set up onlyOnceDialer to avoid returning any errors that could look
// temporary to gRPC, and so we don't expect it to re-dial a connection
// twice (the first re-dial is supposed to surface the permanent error).
return nil, errors.NewAssertionErrorWithWrappedErrf(err, "gRPC connection unexpectedly re-dialed")
}
ood.mu.redialed = true
return nil, err
}

Expand Down
24 changes: 24 additions & 0 deletions pkg/rpc/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2533,3 +2533,27 @@ func TestRejectDialOnQuiesce(t *testing.T) {
require.Error(t, err)
require.Equal(t, codes.Canceled, status.Code(err))
}

// TestOnlyOnceDialer verifies that onlyOnceDialer prevents gRPC from re-dialing
// on an existing connection. (Instead, gRPC will have to make a new dialer).
func TestOnlyOnceDialer(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()
ln, err := net.Listen(util.TestAddr.Network(), util.TestAddr.String())
require.NoError(t, err)
defer func() { _ = ln.Close() }()

ood := &onlyOnceDialer{}
conn, err := ood.dial(ctx, ln.Addr().String())
require.NoError(t, err)
_ = conn.Close()

for i := 0; i < 5; i++ {
_, err := ood.dial(ctx, ln.Addr().String())
if i == 0 {
require.True(t, errors.Is(err, grpcutil.ErrConnectionInterrupted), "i=%d: %+v", i, err)
} else {
require.ErrorContains(t, err, `gRPC connection unexpectedly re-dialed`)
}
}
}
6 changes: 0 additions & 6 deletions pkg/server/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -469,12 +469,6 @@ func (s *initServer) attemptJoinTo(
initClient := roachpb.NewInternalClient(conn)
resp, err := initClient.Join(ctx, req)
if err != nil {
// If the target node does not implement the Join RPC, or explicitly
// returns errJoinRPCUnsupported (because the cluster version
// introducing its usage is not yet active), we error out so the init
// server knows to fall back on the gossip-based discovery mechanism for
// the clusterID.

status, ok := grpcstatus.FromError(errors.UnwrapAll(err))
if !ok {
return nil, err
Expand Down

0 comments on commit 72e3b1f

Please sign in to comment.