From 42e33bb8ff00775d035e58c9f6f323e4e21aeb52 Mon Sep 17 00:00:00 2001 From: Derek Menteer <105233703+hashi-derek@users.noreply.github.com> Date: Tue, 24 Oct 2023 08:52:27 -0500 Subject: [PATCH] =?UTF-8?q?Backport=20of=20Add=20grpc=20keepalive=20config?= =?UTF-8?q?uration.=20into=20release/1.14.x=20(#1=E2=80=A6=20(#19348)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport of Add grpc keepalive configuration. into release/1.14.x (#19339) Add grpc keepalive configuration. (#19339) Prior to the introduction of this configuration, grpc keepalive messages were sent after 2 hours of inactivity on the stream. This posed issues in various scenarios where the server-side xds connection balancing was unaware that envoy instances were uncleanly killed / force-closed, since the connections would only be cleaned up after ~5 minutes of TCP timeouts occurred. Setting this config to a 30 second interval with a 20 second timeout ensures that at most, it should take up to 50 seconds for a dead xds connection to be closed. --- .changelog/19339.txt | 4 ++++ agent/agent.go | 5 +++++ agent/config/builder.go | 9 +++++++++ agent/config/config.go | 8 +++++--- agent/config/default.go | 2 ++ agent/config/runtime.go | 13 +++++++++++++ agent/config/runtime_test.go | 2 ++ .../testdata/TestRuntimeConfig_Sanitize.golden | 2 ++ agent/config/testdata/full-config.hcl | 2 ++ agent/config/testdata/full-config.json | 4 +++- agent/consul/server_test.go | 3 ++- agent/grpc-external/server.go | 8 +++++++- agent/grpc-external/stats_test.go | 3 ++- agent/rpc/peering/service_test.go | 3 ++- website/content/docs/agent/config/config-files.mdx | 4 ++++ 15 files changed, 64 insertions(+), 8 deletions(-) create mode 100644 .changelog/19339.txt diff --git a/.changelog/19339.txt b/.changelog/19339.txt new file mode 100644 index 000000000000..884fb4a3bd16 --- /dev/null +++ b/.changelog/19339.txt @@ -0,0 +1,4 @@ +```release-note:bug +connect: Fix bug where uncleanly closed xDS connections would influence connection balancing for too long and prevent envoy instances from starting. Two new configuration fields +`performance.grpc_keepalive_timeout` and `performance.grpc_keepalive_interval` now exist to allow for configuration on how often these dead connections will be cleaned up. +``` diff --git a/agent/agent.go b/agent/agent.go index e4b507535d3e..cf8681706883 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -31,6 +31,7 @@ import ( "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl/resolver" @@ -570,6 +571,10 @@ func (a *Agent) Start(ctx context.Context) error { a.logger.Named("grpc.external"), metrics.Default(), a.tlsConfigurator, + keepalive.ServerParameters{ + Time: a.config.GRPCKeepaliveInterval, + Timeout: a.config.GRPCKeepaliveTimeout, + }, ) if err := a.startLicenseManager(ctx); err != nil { diff --git a/agent/config/builder.go b/agent/config/builder.go index ee89f5f8bd29..e30d382fc31d 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -1000,6 +1000,8 @@ func (b *builder) build() (rt RuntimeConfig, err error) { GRPCPort: grpcPort, GRPCTLSAddrs: grpcTlsAddrs, GRPCTLSPort: grpcTlsPort, + GRPCKeepaliveInterval: b.durationValWithDefault("performance.grpc_keepalive_interval", c.Performance.GRPCKeepaliveInterval, 30*time.Second), + GRPCKeepaliveTimeout: b.durationValWithDefault("performance.grpc_keepalive_timeout", c.Performance.GRPCKeepaliveTimeout, 20*time.Second), HTTPMaxConnsPerClient: intVal(c.Limits.HTTPMaxConnsPerClient), HTTPSHandshakeTimeout: b.durationVal("limits.https_handshake_timeout", c.Limits.HTTPSHandshakeTimeout), KVMaxValueSize: uint64Val(c.Limits.KVMaxValueSize), @@ -1124,6 +1126,13 @@ func (b *builder) build() (rt RuntimeConfig, err error) { } } + if rt.GRPCKeepaliveInterval < time.Second { + return RuntimeConfig{}, fmt.Errorf("performance.grpc_keepalive_interval must be 1s or greater, was: %v", rt.GRPCKeepaliveInterval) + } + if rt.GRPCKeepaliveTimeout < time.Second { + return RuntimeConfig{}, fmt.Errorf("performance.grpc_keepalive_timeout must be 1s or greater, was: %v", rt.GRPCKeepaliveTimeout) + } + if err := b.BuildEnterpriseRuntimeConfig(&rt, &c); err != nil { return rt, err } diff --git a/agent/config/config.go b/agent/config/config.go index c6a4d4fa1a04..b0cb73ab98da 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -662,9 +662,11 @@ type HTTPConfig struct { } type Performance struct { - LeaveDrainTime *string `mapstructure:"leave_drain_time"` - RaftMultiplier *int `mapstructure:"raft_multiplier"` // todo(fs): validate as uint - RPCHoldTimeout *string `mapstructure:"rpc_hold_timeout"` + LeaveDrainTime *string `mapstructure:"leave_drain_time"` + RaftMultiplier *int `mapstructure:"raft_multiplier"` // todo(fs): validate as uint + RPCHoldTimeout *string `mapstructure:"rpc_hold_timeout"` + GRPCKeepaliveInterval *string `mapstructure:"grpc_keepalive_interval"` + GRPCKeepaliveTimeout *string `mapstructure:"grpc_keepalive_timeout"` } type Telemetry struct { diff --git a/agent/config/default.go b/agent/config/default.go index e80a0d6c3140..f424c6254eb5 100644 --- a/agent/config/default.go +++ b/agent/config/default.go @@ -109,6 +109,8 @@ func DefaultSource() Source { leave_drain_time = "5s" raft_multiplier = ` + strconv.Itoa(int(consul.DefaultRaftMultiplier)) + ` rpc_hold_timeout = "7s" + grpc_keepalive_interval = "30s" + grpc_keepalive_timeout = "20s" } ports = { dns = 8600 diff --git a/agent/config/runtime.go b/agent/config/runtime.go index 93784ff42ee2..da9afacca886 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -719,6 +719,19 @@ type RuntimeConfig struct { // hcl: client_addr = string addresses { grpc_tls = string } ports { grpc_tls = int } GRPCTLSAddrs []net.Addr + // GRPCKeepaliveInterval determines how frequently an HTTP2 keepalive will be broadcast + // whenever a GRPC connection is idle. This helps detect xds connections that have died. + // + // Since the xds load balancing between servers relies on knowing how many connections + // are active, this configuration ensures that they are routinely detected / cleaned up + // on an interval. + GRPCKeepaliveInterval time.Duration + + // GRPCKeepaliveTimeout specifies how long a GRPC client has to reply to the keepalive + // messages spawned from GRPCKeepaliveInterval. If a client does not reply in this amount of + // time, the connection will be closed by the server. + GRPCKeepaliveTimeout time.Duration + // HTTPAddrs contains the list of TCP addresses and UNIX sockets the HTTP // server will bind to. If the HTTP endpoint is disabled (ports.http <= 0) // the list is empty. diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 470fd44d41ef..112244ac96d2 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -6190,6 +6190,8 @@ func TestLoad_FullConfig(t *testing.T) { GRPCAddrs: []net.Addr{tcpAddr("32.31.61.91:4881")}, GRPCTLSPort: 5201, GRPCTLSAddrs: []net.Addr{tcpAddr("23.14.88.19:5201")}, + GRPCKeepaliveInterval: 33 * time.Second, + GRPCKeepaliveTimeout: 22 * time.Second, HTTPAddrs: []net.Addr{tcpAddr("83.39.91.39:7999")}, HTTPBlockEndpoints: []string{"RBvAFcGD", "fWOWFznh"}, AllowWriteHTTPFrom: []*net.IPNet{cidr("127.0.0.0/8"), cidr("22.33.44.55/32"), cidr("0.0.0.0/0")}, diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index 3b7498881929..387e1634930d 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -205,6 +205,8 @@ "GRPCPort": 0, "GRPCTLSAddrs": [], "GRPCTLSPort": 0, + "GRPCKeepaliveInterval": "0s", + "GRPCKeepaliveTimeout": "0s", "GossipLANGossipInterval": "0s", "GossipLANGossipNodes": 0, "GossipLANProbeInterval": "0s", diff --git a/agent/config/testdata/full-config.hcl b/agent/config/testdata/full-config.hcl index 212829cd98d4..04abdbe266f9 100644 --- a/agent/config/testdata/full-config.hcl +++ b/agent/config/testdata/full-config.hcl @@ -325,6 +325,8 @@ performance { leave_drain_time = "8265s" raft_multiplier = 5 rpc_hold_timeout = "15707s" + grpc_keepalive_interval = "33s" + grpc_keepalive_timeout = "22s" } pid_file = "43xN80Km" ports { diff --git a/agent/config/testdata/full-config.json b/agent/config/testdata/full-config.json index c6b9c4b7479e..5778dca1d937 100644 --- a/agent/config/testdata/full-config.json +++ b/agent/config/testdata/full-config.json @@ -324,7 +324,9 @@ "performance": { "leave_drain_time": "8265s", "raft_multiplier": 5, - "rpc_hold_timeout": "15707s" + "rpc_hold_timeout": "15707s", + "grpc_keepalive_interval": "33s", + "grpc_keepalive_timeout": "22s" }, "pid_file": "43xN80Km", "ports": { diff --git a/agent/consul/server_test.go b/agent/consul/server_test.go index 299b2cc158b3..167b0a3a75d5 100644 --- a/agent/consul/server_test.go +++ b/agent/consul/server_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" "github.com/hashicorp/consul/agent/hcp" @@ -331,7 +332,7 @@ func newServerWithDeps(t *testing.T, c *Config, deps Deps) (*Server, error) { oldNotify() } } - grpcServer := external.NewServer(deps.Logger.Named("grpc.external"), nil, deps.TLSConfigurator) + grpcServer := external.NewServer(deps.Logger.Named("grpc.external"), nil, deps.TLSConfigurator, keepalive.ServerParameters{}) srv, err := NewServer(c, deps, grpcServer) if err != nil { return nil, err diff --git a/agent/grpc-external/server.go b/agent/grpc-external/server.go index dd0186d480f3..7d96d6b1c9d3 100644 --- a/agent/grpc-external/server.go +++ b/agent/grpc-external/server.go @@ -23,7 +23,12 @@ var ( // NewServer constructs a gRPC server for the external gRPC port, to which // handlers can be registered. -func NewServer(logger agentmiddleware.Logger, metricsObj *metrics.Metrics, tls *tlsutil.Configurator) *grpc.Server { +func NewServer( + logger agentmiddleware.Logger, + metricsObj *metrics.Metrics, + tls *tlsutil.Configurator, + keepaliveParams keepalive.ServerParameters, +) *grpc.Server { if metricsObj == nil { metricsObj = metrics.Default() } @@ -51,6 +56,7 @@ func NewServer(logger agentmiddleware.Logger, metricsObj *metrics.Metrics, tls * grpc.StatsHandler(agentmiddleware.NewStatsHandler(metricsObj, metricsLabels)), middleware.WithUnaryServerChain(unaryInterceptors...), middleware.WithStreamServerChain(streamInterceptors...), + grpc.KeepaliveParams(keepaliveParams), grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{ // This must be less than the keealive.ClientParameters Time setting, otherwise // the server will disconnect the client for sending too many keepalive pings. diff --git a/agent/grpc-external/stats_test.go b/agent/grpc-external/stats_test.go index c231a922d121..7bdf424e385f 100644 --- a/agent/grpc-external/stats_test.go +++ b/agent/grpc-external/stats_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" "google.golang.org/grpc" + "google.golang.org/grpc/keepalive" "github.com/hashicorp/go-hclog" @@ -23,7 +24,7 @@ import ( func TestServer_EmitsStats(t *testing.T) { sink, metricsObj := testutil.NewFakeSink(t) - srv := NewServer(hclog.Default(), metricsObj, nil) + srv := NewServer(hclog.Default(), metricsObj, nil, keepalive.ServerParameters{}) testservice.RegisterSimpleServer(srv, &testservice.Simple{}) diff --git a/agent/rpc/peering/service_test.go b/agent/rpc/peering/service_test.go index 53782df1807a..58c71cb13d00 100644 --- a/agent/rpc/peering/service_test.go +++ b/agent/rpc/peering/service_test.go @@ -18,6 +18,7 @@ import ( "github.com/stretchr/testify/require" gogrpc "google.golang.org/grpc" "google.golang.org/grpc/codes" + "google.golang.org/grpc/keepalive" "google.golang.org/grpc/metadata" grpcstatus "google.golang.org/grpc/status" "google.golang.org/protobuf/proto" @@ -1811,7 +1812,7 @@ func newTestServer(t *testing.T, cb func(conf *consul.Config)) testingServer { conf.ACLResolverSettings.EnterpriseMeta = *conf.AgentEnterpriseMeta() deps := newDefaultDeps(t, conf) - externalGRPCServer := external.NewServer(deps.Logger, nil, deps.TLSConfigurator) + externalGRPCServer := external.NewServer(deps.Logger, nil, deps.TLSConfigurator, keepalive.ServerParameters{}) server, err := consul.NewServer(conf, deps, externalGRPCServer) require.NoError(t, err) diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index e6c90c62f549..0950cd0fd8ce 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -603,6 +603,10 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati This was added in Consul 1.0. Must be a duration value such as 10s. Defaults to 7s. + - `grpc_keepalive_interval` - A duration that determines the frequency that Consul servers send keep-alive messages to inactive gRPC clients. Configure this setting to modify how quickly Consul detects and removes improperly closed xDS or peering connections. Default is `30s`. + + - `grpc_keepalive_timeout` - A duration that determines how long a Consul server waits for a reply to a keep-alive message. If the server does not receive a reply before the end of the duration, Consul flags the gRPC connection as unhealthy and forcibly removes it. Defaults to `20s`. + - `pid_file` Equivalent to the [`-pid-file` command line flag](/docs/agent/config/cli-flags#_pid_file). - `ports` This is a nested object that allows setting the bind ports for the following keys: