diff --git a/pkg/base/config.go b/pkg/base/config.go
index 462e6824ba26..8e8bfcd31423 100644
--- a/pkg/base/config.go
+++ b/pkg/base/config.go
@@ -113,41 +113,57 @@ func DefaultHistogramWindowInterval() time.Duration {
 var (
 	// NetworkTimeout is the timeout used for network operations that require a
 	// single network round trip. It is conservatively defined as one maximum
-	// network round trip time (RTT) plus one TCP packet retransmit (RTO), then
-	// multiplied by 2 as a safety margin.
+	// network round trip time (RTT) plus one TCP packet retransmit (RTO) with an
+	// additional safety margin.
 	//
-	// The maximum RTT between cloud regions is roughly 350ms both in GCP
-	// (asia-south2 to southamerica-west1) and AWS (af-south-1 to sa-east-1). It
-	// can occasionally be up to 500ms, but 400ms is a reasonable upper bound
-	// under nominal conditions.
+	// The maximum RTT between cloud regions is roughly 400ms both in GCP
+	// (asia-south2 to southamerica-west1) and AWS (af-south-1 to sa-east-1), but
+	// p99 RTTs can occasionally approach 600ms.
 	// https://datastudio.google.com/reporting/fc733b10-9744-4a72-a502-92290f608571/page/70YCB
 	// https://www.cloudping.co/grid/p_99/timeframe/1W
 	//
 	// Linux has an RTT-dependant retransmission timeout (RTO) which we can
 	// approximate as 1.5x RTT (smoothed RTT + 4x RTT variance), with a lower
-	// bound of 200ms. Under nominal conditions, this is approximately 600ms.
+	// bound of 200ms. It can thus be up to 900ms in the worst case.
 	//
-	// The maximum p99 RPC heartbeat latency in any Cockroach Cloud cluster over a
-	// 90-day period was 557ms. This was a single-region US cluster, where the
-	// high latency appeared to be due to CPU overload or throttling: the cluster
-	// had 2 vCPU nodes running at 100%.
-	//
-	// The NetworkTimeout is thus set to 2 * (400ms + 600ms) = 2s.
-	//
-	// TODO(erikgrinaker): Consider reducing this to 1 second, which should be
-	// sufficient but may be fragile under latency fluctuations.
+	// The NetworkTimeout is therefore set to 2 seconds: 600ms RTT plus 900ms RTO
+	// plus a 500ms safety margin.
 	NetworkTimeout = envutil.EnvOrDefaultDuration("COCKROACH_NETWORK_TIMEOUT", 2*time.Second)

 	// DialTimeout is the timeout used when dialing a node. gRPC connections take
-	// up to 3 roundtrips for the TCP + TLS handshakes. Because NetworkTimeout
-	// allows for both a network roundtrip (RTT) and a TCP retransmit (RTO), with
-	// the RTO being greater than the RTT, and we don't need to tolerate more than
-	// 1 retransmit per connection attempt, 2 * NetworkTimeout is sufficient.
+	// up to 3 roundtrips for the TCP + TLS handshakes. NetworkTimeout allows for
+	// both a network roundtrip and a TCP retransmit, but we don't need to
+	// tolerate more than 1 retransmit per connection attempt, so
+	// 2 * NetworkTimeout is sufficient.
 	DialTimeout = 2 * NetworkTimeout

-	// defaultRPCHeartbeatIntervalAndTimeout is the default value of
-	// RPCHeartbeatIntervalAndTimeout used by the RPC context.
-	defaultRPCHeartbeatIntervalAndTimeout = NetworkTimeout
+	// PingInterval is the interval between network heartbeat pings. It is used
+	// both for RPC heartbeat intervals and gRPC server keepalive pings. It is
+	// set to 1 second in order to fail fast, but with large default timeouts
+	// to tolerate high-latency multiregion clusters.
+	PingInterval = envutil.EnvOrDefaultDuration("COCKROACH_PING_INTERVAL", time.Second)
+
+	// defaultRPCHeartbeatTimeout is the default RPC heartbeat timeout. It is set
+	// very high at 3 * NetworkTimeout for several reasons: the gRPC transport may
+	// need to complete a dial/handshake before sending the heartbeat, the
+	// heartbeat has occasionally been seen to require 3 RTTs even post-dial (for
+	// unknown reasons), and under load the heartbeat may be head-of-line blocked
+	// by other RPC traffic.
+	//
+	// High-latency experiments with 6s RPC heartbeat timeouts showed that
+	// clusters running TPCC imports were stable at 400ms RTT, but started seeing
+	// RPC heartbeat failures at 500ms RTT. With light load (e.g. rate-limited
+	// kv50), clusters were stable at 1000ms RTT.
+	//
+	// The maximum p99 RPC heartbeat latency in any Cockroach Cloud cluster over a
+	// 90-day period was found to be 557ms. This was a single-region US cluster
+	// where the latency was caused by CPU overload.
+	//
+	// TODO(erikgrinaker): We should avoid head-of-line blocking for RPC
+	// heartbeats and reduce this to NetworkTimeout (plus DialTimeout for the
+	// initial heartbeat), see:
+	// https://github.com/cockroachdb/cockroach/issues/93397.
+	defaultRPCHeartbeatTimeout = 3 * NetworkTimeout

 	// defaultRaftElectionTimeoutTicks specifies the number of Raft Tick
 	// invocations that must pass between elections.
@@ -252,11 +268,13 @@ type Config struct {
 	// This is computed from HTTPAddr if specified otherwise Addr.
 	HTTPAdvertiseAddr string

-	// RPCHeartbeatIntervalAndTimeout controls how often a Ping request is sent on
-	// peer connections to determine connection health and update the local view
-	// of remote clocks. This is also used as a timeout for heartbeats, so don't
-	// set this too low.
-	RPCHeartbeatIntervalAndTimeout time.Duration
+	// RPCHeartbeatInterval controls how often Ping requests are sent on peer
+	// connections to determine connection health and update the local view of
+	// remote clocks.
+	RPCHeartbeatInterval time.Duration
+
+	// RPCHeartbeatTimeout is the timeout for Ping requests.
+	RPCHeartbeatTimeout time.Duration

 	// SecondaryTenantPortOffset is the increment to add to the various
 	// addresses to generate the network configuration for the in-memory
@@ -314,7 +332,8 @@ func (cfg *Config) InitDefaults() {
 	cfg.SQLAdvertiseAddr = cfg.SQLAddr
 	cfg.SocketFile = ""
 	cfg.SSLCertsDir = DefaultCertsDirectory
-	cfg.RPCHeartbeatIntervalAndTimeout = defaultRPCHeartbeatIntervalAndTimeout
+	cfg.RPCHeartbeatInterval = PingInterval
+	cfg.RPCHeartbeatTimeout = defaultRPCHeartbeatTimeout
 	cfg.ClusterName = ""
 	cfg.DisableClusterNameVerification = false
 	cfg.ClockDevicePath = ""
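[Editor's note: the timeout budget in the NetworkTimeout comment above can be checked with a small standalone sketch. This is illustrative only, not code from the patch; approxLinuxRTO is a hypothetical helper encoding the quoted approximation (RTO of roughly 1.5x RTT, with a 200ms lower bound).]

    package main

    import (
    	"fmt"
    	"time"
    )

    // approxLinuxRTO encodes the approximation quoted above: the Linux
    // retransmission timeout is roughly 1.5x RTT (smoothed RTT + 4x RTT
    // variance), with a 200ms lower bound.
    func approxLinuxRTO(rtt time.Duration) time.Duration {
    	rto := rtt * 3 / 2
    	if floor := 200 * time.Millisecond; rto < floor {
    		return floor
    	}
    	return rto
    }

    func main() {
    	rtt := 600 * time.Millisecond    // worst-case p99 inter-region RTT
    	rto := approxLinuxRTO(rtt)       // 900ms
    	margin := 500 * time.Millisecond // safety margin
    	fmt.Println(rtt + rto + margin)  // 2s, the NetworkTimeout default
    }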
diff --git a/pkg/rpc/context.go b/pkg/rpc/context.go
index d1a31a69d480..d32b062f3d5c 100644
--- a/pkg/rpc/context.go
+++ b/pkg/rpc/context.go
@@ -365,8 +365,9 @@ type Context struct {
 	RemoteClocks *RemoteClockMonitor
 	MasterCtx    context.Context

-	heartbeatTimeout time.Duration
-	HeartbeatCB      func()
+	heartbeatInterval time.Duration
+	heartbeatTimeout  time.Duration
+	HeartbeatCB       func()

 	rpcCompression bool

@@ -571,7 +572,8 @@ func NewContext(ctx context.Context, opts ContextOptions) *Context {
 		rpcCompression:      enableRPCCompression,
 		MasterCtx:           masterCtx,
 		metrics:             makeMetrics(),
-		heartbeatTimeout:    opts.Config.RPCHeartbeatIntervalAndTimeout,
+		heartbeatInterval:   opts.Config.RPCHeartbeatInterval,
+		heartbeatTimeout:    opts.Config.RPCHeartbeatTimeout,
 		logClosingConnEvery: log.Every(time.Second),
 	}

@@ -579,7 +581,7 @@ func NewContext(ctx context.Context, opts ContextOptions) *Context {
 	// CLI commands are exempted.
 	if !opts.ClientOnly {
 		rpcCtx.RemoteClocks = newRemoteClockMonitor(
-			opts.Clock, opts.MaxOffset, 10*opts.Config.RPCHeartbeatIntervalAndTimeout, opts.Config.HistogramWindowInterval())
+			opts.Clock, opts.MaxOffset, 10*opts.Config.RPCHeartbeatTimeout, opts.Config.HistogramWindowInterval())
 	}

 	if id := opts.Knobs.StorageClusterID; id != nil {
@@ -2193,7 +2195,7 @@ func (rpcCtx *Context) runHeartbeat(
 		})
 	}

-		heartbeatTimer.Reset(rpcCtx.Config.RPCHeartbeatIntervalAndTimeout)
+		heartbeatTimer.Reset(rpcCtx.heartbeatInterval)
 	}
 }
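[Editor's note: the new fields decouple how often heartbeats are scheduled from how long each one may take. A minimal sketch of that pattern follows; it is not the actual runHeartbeat loop, and sendPing and runHeartbeatLoop are hypothetical names invented for the example.]

    package main

    import (
    	"context"
    	"fmt"
    	"time"
    )

    // sendPing is a stand-in for the real heartbeat RPC.
    func sendPing(ctx context.Context) error {
    	select {
    	case <-time.After(10 * time.Millisecond): // simulated round trip
    		return nil
    	case <-ctx.Done():
    		return ctx.Err()
    	}
    }

    func runHeartbeatLoop(ctx context.Context, interval, timeout time.Duration) {
    	ticker := time.NewTicker(interval)
    	defer ticker.Stop()
    	for {
    		select {
    		case <-ticker.C:
    			// Each ping gets its own deadline, independent of how
    			// often pings are scheduled.
    			pingCtx, cancel := context.WithTimeout(ctx, timeout)
    			err := sendPing(pingCtx)
    			cancel()
    			fmt.Println("ping:", err)
    		case <-ctx.Done():
    			return
    		}
    	}
    }

    func main() {
    	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    	defer cancel()
    	// A short interval to fail fast, with a long per-ping timeout to
    	// tolerate high-latency links, mirroring PingInterval vs.
    	// RPCHeartbeatTimeout above.
    	runHeartbeatLoop(ctx, time.Second, 6*time.Second)
    }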
diff --git a/pkg/rpc/context_test.go b/pkg/rpc/context_test.go
index 03027eab37be..b301f39665b7 100644
--- a/pkg/rpc/context_test.go
+++ b/pkg/rpc/context_test.go
@@ -265,7 +265,8 @@ func testClockOffsetInPingRequestInternal(t *testing.T, clientOnly bool) {
 	clientOpts := opts
 	clientOpts.Config = testutils.NewNodeTestBaseContext()
 	// Experimentally, values below 50ms seem to incur flakiness.
-	clientOpts.Config.RPCHeartbeatIntervalAndTimeout = 100 * time.Millisecond
+	clientOpts.Config.RPCHeartbeatInterval = 100 * time.Millisecond
+	clientOpts.Config.RPCHeartbeatTimeout = 100 * time.Millisecond
 	clientOpts.ClientOnly = clientOnly
 	clientOpts.OnOutgoingPing = func(ctx context.Context, req *PingRequest) error {
 		select {
@@ -903,7 +904,8 @@ func TestHeartbeatHealth(t *testing.T) {
 	clientCtx.Config.AdvertiseAddr = lisLocalServer.Addr().String()
 	// Make the interval shorter to speed up the test.
-	clientCtx.Config.RPCHeartbeatIntervalAndTimeout = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatInterval = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatTimeout = 1 * time.Millisecond

 	m := clientCtx.Metrics()

@@ -1148,7 +1150,8 @@ func TestHeartbeatHealthTransport(t *testing.T) {
 	clientCtx := newTestContext(clusterID, clock, maxOffset, stopper)
 	// Make the interval shorter to speed up the test.
-	clientCtx.Config.RPCHeartbeatIntervalAndTimeout = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatInterval = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatTimeout = 1 * time.Millisecond
 	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(context.Background()); err != nil {
 		t.Fatal(err)
 	}
@@ -1259,7 +1262,7 @@ func TestHeartbeatHealthTransport(t *testing.T) {
 	})

 	// Should stay unhealthy despite reconnection attempts.
-	for then := timeutil.Now(); timeutil.Since(then) < 50*clientCtx.Config.RPCHeartbeatIntervalAndTimeout; {
+	for then := timeutil.Now(); timeutil.Since(then) < 50*clientCtx.Config.RPCHeartbeatTimeout; {
 		err := clientCtx.TestingConnHealth(remoteAddr, serverNodeID)
 		if !isUnhealthy(err) {
 			t.Fatal(err)
 		}
@@ -1303,7 +1306,8 @@ func TestOffsetMeasurement(t *testing.T) {
 	clientMaxOffset := time.Duration(0)
 	clientCtx := newTestContext(clusterID, clientClock, clientMaxOffset, stopper)
 	// Make the interval shorter to speed up the test.
-	clientCtx.Config.RPCHeartbeatIntervalAndTimeout = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatInterval = 1 * time.Millisecond
+	clientCtx.Config.RPCHeartbeatTimeout = 1 * time.Millisecond
 	clientCtx.RemoteClocks.offsetTTL = 5 * clientClock.getAdvancementInterval()
 	if _, err := clientCtx.GRPCDialNode(remoteAddr, serverNodeID, DefaultClass).Connect(ctx); err != nil {
 		t.Fatal(err)
 	}
@@ -1465,7 +1469,8 @@ func TestRemoteOffsetUnhealthy(t *testing.T) {
 		clock := timeutil.NewManualTime(timeutil.Unix(0, start.Add(nodeCtxs[i].offset).UnixNano()))
 		nodeCtxs[i].errChan = make(chan error, 1)
 		nodeCtxs[i].ctx = newTestContext(clusterID, clock, maxOffset, stopper)
-		nodeCtxs[i].ctx.Config.RPCHeartbeatIntervalAndTimeout = maxOffset
+		nodeCtxs[i].ctx.Config.RPCHeartbeatInterval = maxOffset
+		nodeCtxs[i].ctx.Config.RPCHeartbeatTimeout = maxOffset
 		nodeCtxs[i].ctx.NodeID.Set(context.Background(), roachpb.NodeID(i+1))

 		s := newTestServer(t, nodeCtxs[i].ctx)
@@ -1681,7 +1686,8 @@ func grpcRunKeepaliveTestCase(testCtx context.Context, c grpcKeepaliveTestCase)
 	log.Infof(ctx, "setting up client")
 	clientCtx := newTestContext(clusterID, clock, maxOffset, stopper)
 	// Disable automatic heartbeats. We'll send them by hand.
-	clientCtx.Config.RPCHeartbeatIntervalAndTimeout = math.MaxInt64
+	clientCtx.Config.RPCHeartbeatInterval = math.MaxInt64
+	clientCtx.Config.RPCHeartbeatTimeout = math.MaxInt64

 	var firstConn int32 = 1
diff --git a/pkg/rpc/keepalive.go b/pkg/rpc/keepalive.go
index cafd4376241e..abfe2dbc313b 100644
--- a/pkg/rpc/keepalive.go
+++ b/pkg/rpc/keepalive.go
@@ -37,8 +37,8 @@ var clientKeepalive = keepalive.ClientParameters{
 	PermitWithoutStream: true,
 }
 var serverKeepalive = keepalive.ServerParameters{
-	// Send periodic pings on the connection.
-	Time: base.NetworkTimeout,
+	// Send periodic pings on the connection when there is no other traffic.
+	Time: base.PingInterval,
 	// If the pings don't get a response within the timeout, we might be
 	// experiencing a network partition. gRPC will close the transport-level
 	// connection and all the pending RPCs (which may not have timeouts) will
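[Editor's note: serverKeepalive feeds into gRPC's standard keepalive server option. A minimal sketch of how such parameters are applied to a server follows; the literal durations stand in for base.PingInterval and base.NetworkTimeout, and the Timeout value shown is an assumption, since this hunk does not show it.]

    package main

    import (
    	"time"

    	"google.golang.org/grpc"
    	"google.golang.org/grpc/keepalive"
    )

    func main() {
    	srv := grpc.NewServer(grpc.KeepaliveParams(keepalive.ServerParameters{
    		// Ping the client when the connection has been idle this long.
    		Time: time.Second, // stands in for base.PingInterval
    		// Drop the connection if a ping goes unanswered for this long,
    		// which likely indicates a network partition.
    		Timeout: 2 * time.Second, // illustrative value only
    	}))
    	defer srv.Stop()
    	// Register services and call srv.Serve(listener) as usual.
    }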
diff --git a/pkg/rpc/nodedialer/nodedialer_test.go b/pkg/rpc/nodedialer/nodedialer_test.go
index db7841875d29..4546405fc635 100644
--- a/pkg/rpc/nodedialer/nodedialer_test.go
+++ b/pkg/rpc/nodedialer/nodedialer_test.go
@@ -462,7 +462,8 @@ func newTestContext(
 ) *rpc.Context {
 	cfg := testutils.NewNodeTestBaseContext()
 	cfg.Insecure = true
-	cfg.RPCHeartbeatIntervalAndTimeout = 100 * time.Millisecond
+	cfg.RPCHeartbeatInterval = 100 * time.Millisecond
+	cfg.RPCHeartbeatTimeout = 100 * time.Millisecond
 	ctx := context.Background()
 	rctx := rpc.NewContext(ctx, rpc.ContextOptions{
 		TenantID: roachpb.SystemTenantID,
diff --git a/pkg/server/server_controller.go b/pkg/server/server_controller.go
index a4fc00cad6a5..a2789890af9a 100644
--- a/pkg/server/server_controller.go
+++ b/pkg/server/server_controller.go
@@ -464,7 +464,8 @@ func makeInMemoryTenantServerConfig(
 	baseCfg.Config.User = kvServerCfg.Config.User
 	baseCfg.Config.DisableTLSForHTTP = kvServerCfg.Config.DisableTLSForHTTP
 	baseCfg.Config.AcceptSQLWithoutTLS = kvServerCfg.Config.AcceptSQLWithoutTLS
-	baseCfg.Config.RPCHeartbeatIntervalAndTimeout = kvServerCfg.Config.RPCHeartbeatIntervalAndTimeout
+	baseCfg.Config.RPCHeartbeatInterval = kvServerCfg.Config.RPCHeartbeatInterval
+	baseCfg.Config.RPCHeartbeatTimeout = kvServerCfg.Config.RPCHeartbeatTimeout
 	baseCfg.Config.ClockDevicePath = kvServerCfg.Config.ClockDevicePath
 	baseCfg.Config.ClusterName = kvServerCfg.Config.ClusterName
 	baseCfg.Config.DisableClusterNameVerification = kvServerCfg.Config.DisableClusterNameVerification
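[Editor's note: taken together, both knobs touched by this patch are environment-driven, and the other timeouts derive from them. A standalone sketch of how the defaults relate follows; envOrDefaultDuration is a simplified stand-in for envutil.EnvOrDefaultDuration, not the real implementation.]

    package main

    import (
    	"fmt"
    	"os"
    	"time"
    )

    // envOrDefaultDuration parses the named environment variable as a
    // duration, falling back to def. It is a simplified stand-in for
    // envutil.EnvOrDefaultDuration.
    func envOrDefaultDuration(name string, def time.Duration) time.Duration {
    	if s, ok := os.LookupEnv(name); ok {
    		if d, err := time.ParseDuration(s); err == nil {
    			return d
    		}
    	}
    	return def
    }

    func main() {
    	networkTimeout := envOrDefaultDuration("COCKROACH_NETWORK_TIMEOUT", 2*time.Second)
    	pingInterval := envOrDefaultDuration("COCKROACH_PING_INTERVAL", time.Second)

    	fmt.Println("NetworkTimeout:      ", networkTimeout)
    	fmt.Println("DialTimeout:         ", 2*networkTimeout) // 2 * NetworkTimeout
    	fmt.Println("RPCHeartbeatInterval:", pingInterval)     // PingInterval
    	fmt.Println("RPCHeartbeatTimeout: ", 3*networkTimeout) // 3 * NetworkTimeout
    }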