Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(kds): nack backoff #5894

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/config/app/kuma-cp/kuma-cp.defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@ multizone:
# MsgSendTimeout defines a timeout on sending a single KDS message.
# KDS stream between control planes is terminated if the control plane hits this timeout.
msgSendTimeout: 60s # ENV: KUMA_MULTIZONE_GLOBAL_KDS_MSG_SEND_TIMEOUT
# Backoff applied before the global control plane resends a response that was previously rejected (NACKed) by a zone control plane
nackBackoff: 5s # ENV: KUMA_MULTIZONE_GLOBAL_KDS_NACK_BACKOFF
zone:
# Kuma Zone name used to mark the zone dataplane resources
name: "" # ENV: KUMA_MULTIZONE_ZONE_NAME
Expand All @@ -427,6 +429,8 @@ multizone:
# MsgSendTimeout defines a timeout on sending a single KDS message.
# KDS stream between control planes is terminated if the control plane hits this timeout.
msgSendTimeout: 60s # ENV: KUMA_MULTIZONE_ZONE_KDS_MSG_SEND_TIMEOUT
# Backoff applied before the zone control plane resends a response that was previously rejected (NACKed) by the global control plane
nackBackoff: 5s # ENV: KUMA_MULTIZONE_ZONE_KDS_NACK_BACKOFF

# Diagnostics configuration
diagnostics:
Expand Down
6 changes: 6 additions & 0 deletions pkg/config/loader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,14 @@ var _ = Describe("Config loader", func() {
Expect(cfg.Multizone.Global.KDS.TlsKeyFile).To(Equal("/key"))
Expect(cfg.Multizone.Global.KDS.MaxMsgSize).To(Equal(uint32(1)))
Expect(cfg.Multizone.Global.KDS.MsgSendTimeout.Duration).To(Equal(10 * time.Second))
Expect(cfg.Multizone.Global.KDS.NackBackoff.Duration).To(Equal(11 * time.Second))
Expect(cfg.Multizone.Zone.GlobalAddress).To(Equal("grpc://1.1.1.1:5685"))
Expect(cfg.Multizone.Zone.Name).To(Equal("zone-1"))
Expect(cfg.Multizone.Zone.KDS.RootCAFile).To(Equal("/rootCa"))
Expect(cfg.Multizone.Zone.KDS.RefreshInterval.Duration).To(Equal(9 * time.Second))
Expect(cfg.Multizone.Zone.KDS.MaxMsgSize).To(Equal(uint32(2)))
Expect(cfg.Multizone.Zone.KDS.MsgSendTimeout.Duration).To(Equal(20 * time.Second))
Expect(cfg.Multizone.Zone.KDS.NackBackoff.Duration).To(Equal(21 * time.Second))

Expect(cfg.Defaults.SkipMeshCreation).To(BeTrue())
Expect(cfg.Defaults.EnableLocalhostInboundClusters).To(BeTrue())
Expand Down Expand Up @@ -489,6 +491,7 @@ multizone:
tlsCipherSuites: ["TLS_RSA_WITH_AES_128_CBC_SHA", "TLS_AES_256_GCM_SHA384"]
maxMsgSize: 1
msgSendTimeout: 10s
nackBackoff: 11s
zone:
globalAddress: "grpc://1.1.1.1:5685"
name: "zone-1"
Expand All @@ -497,6 +500,7 @@ multizone:
rootCaFile: /rootCa
maxMsgSize: 2
msgSendTimeout: 20s
nackBackoff: 21s
dnsServer:
domain: test-domain
CIDR: 127.1.0.0/16
Expand Down Expand Up @@ -725,12 +729,14 @@ proxy:
"KUMA_MULTIZONE_GLOBAL_KDS_TLS_CIPHER_SUITES": "TLS_RSA_WITH_AES_128_CBC_SHA,TLS_AES_256_GCM_SHA384",
"KUMA_MULTIZONE_GLOBAL_KDS_MAX_MSG_SIZE": "1",
"KUMA_MULTIZONE_GLOBAL_KDS_MSG_SEND_TIMEOUT": "10s",
"KUMA_MULTIZONE_GLOBAL_KDS_NACK_BACKOFF": "11s",
"KUMA_MULTIZONE_ZONE_GLOBAL_ADDRESS": "grpc://1.1.1.1:5685",
"KUMA_MULTIZONE_ZONE_NAME": "zone-1",
"KUMA_MULTIZONE_ZONE_KDS_ROOT_CA_FILE": "/rootCa",
"KUMA_MULTIZONE_ZONE_KDS_REFRESH_INTERVAL": "9s",
"KUMA_MULTIZONE_ZONE_KDS_MAX_MSG_SIZE": "2",
"KUMA_MULTIZONE_ZONE_KDS_MSG_SEND_TIMEOUT": "20s",
"KUMA_MULTIZONE_ZONE_KDS_NACK_BACKOFF": "21s",
"KUMA_MULTIZONE_GLOBAL_KDS_ZONE_INSIGHT_FLUSH_INTERVAL": "5s",
"KUMA_DEFAULTS_SKIP_MESH_CREATION": "true",
"KUMA_DEFAULTS_ENABLE_LOCALHOST_INBOUND_CLUSTERS": "true",
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/multizone/kds.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ type KdsServerConfig struct {
// MsgSendTimeout defines a timeout on sending a single KDS message.
// KDS stream between control planes is terminated if the control plane hits this timeout.
MsgSendTimeout config_types.Duration `json:"msgSendTimeout" envconfig:"kuma_multizone_global_kds_msg_send_timeout"`
// NackBackoff is the backoff applied before the global control plane resends a response that was previously rejected (NACKed) by a zone control plane.
NackBackoff config_types.Duration `json:"nackBackoff" envconfig:"kuma_multizone_global_kds_nack_backoff"`
}

var _ config.Config = &KdsServerConfig{}
Expand Down Expand Up @@ -78,6 +80,8 @@ type KdsClientConfig struct {
// MsgSendTimeout defines a timeout on sending a single KDS message.
// KDS stream between control planes is terminated if the control plane hits this timeout.
MsgSendTimeout config_types.Duration `json:"msgSendTimeout" envconfig:"kuma_multizone_zone_kds_msg_send_timeout"`
// NackBackoff is the backoff applied before the zone control plane resends a response that was previously rejected (NACKed) by the global control plane.
NackBackoff config_types.Duration `json:"nackBackoff" envconfig:"kuma_multizone_zone_kds_nack_backoff"`
}

var _ config.Config = &KdsClientConfig{}
Expand Down
2 changes: 2 additions & 0 deletions pkg/config/multizone/multicluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func DefaultGlobalConfig() *GlobalConfig {
MsgSendTimeout: config_types.Duration{Duration: 60 * time.Second},
TlsMinVersion: "TLSv1_2",
TlsCipherSuites: []string{},
NackBackoff: config_types.Duration{Duration: 5 * time.Second},
},
}
}
Expand Down Expand Up @@ -99,6 +100,7 @@ func DefaultZoneConfig() *ZoneConfig {
RefreshInterval: config_types.Duration{Duration: 1 * time.Second},
MaxMsgSize: 10 * 1024 * 1024,
MsgSendTimeout: config_types.Duration{Duration: 60 * time.Second},
NackBackoff: config_types.Duration{Duration: 5 * time.Second},
},
}
}
Expand Down
14 changes: 11 additions & 3 deletions pkg/kds/global/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,17 @@ func Setup(rt runtime.Runtime) error {
return nil
}
reg := registry.Global()
kdsServer, err := kds_server.New(kdsGlobalLog, rt, reg.ObjectTypes(model.HasKDSFlag(model.ProvidedByGlobal)),
"global", rt.Config().Multizone.Global.KDS.RefreshInterval.Duration,
rt.KDSContext().GlobalProvidedFilter, rt.KDSContext().GlobalResourceMapper, true)
kdsServer, err := kds_server.New(
kdsGlobalLog,
rt,
reg.ObjectTypes(model.HasKDSFlag(model.ProvidedByGlobal)),
"global",
rt.Config().Multizone.Global.KDS.RefreshInterval.Duration,
rt.KDSContext().GlobalProvidedFilter,
rt.KDSContext().GlobalResourceMapper,
true,
rt.Config().Multizone.Global.KDS.NackBackoff.Duration,
)
if err != nil {
return err
}
Expand Down
13 changes: 12 additions & 1 deletion pkg/kds/server/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@ import (
util_xds_v3 "github.com/kumahq/kuma/pkg/util/xds/v3"
)

func New(log logr.Logger, rt core_runtime.Runtime, providedTypes []model.ResourceType, serverID string, refresh time.Duration, filter reconcile.ResourceFilter, mapper reconcile.ResourceMapper, insight bool) (Server, error) {
func New(
log logr.Logger,
rt core_runtime.Runtime,
providedTypes []model.ResourceType,
serverID string,
refresh time.Duration,
filter reconcile.ResourceFilter,
mapper reconcile.ResourceMapper,
insight bool,
nackBackoff time.Duration,
) (Server, error) {
hasher, cache := newKDSContext(log)
generator := reconcile.NewSnapshotGenerator(rt.ReadOnlyResourceManager(), providedTypes, filter, mapper)
versioner := util_xds_v3.SnapshotAutoVersioner{UUID: core.NewUUID}
Expand All @@ -38,6 +48,7 @@ func New(log logr.Logger, rt core_runtime.Runtime, providedTypes []model.Resourc
util_xds_v3.NewControlPlaneIdCallbacks(serverID),
util_xds_v3.AdaptCallbacks(util_xds.LoggingCallbacks{Log: log}),
util_xds_v3.AdaptCallbacks(statsCallbacks),
util_xds_v3.AdaptCallbacks(NewNackBackoff(nackBackoff)),
syncTracker,
}
if insight {
Expand Down
40 changes: 40 additions & 0 deletions pkg/kds/server/nack_backoff.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package server

import (
"time"

"github.com/kumahq/kuma/pkg/core"
util_xds "github.com/kumahq/kuma/pkg/util/xds"
)

// nackLog is the logger used to report that a backoff is being applied
// because a request carried errors (see OnStreamResponse).
var nackLog = core.Log.WithName("kds").WithName("nack-backoff")

// nackBackoff is a util_xds.Callbacks implementation that pauses before a
// response is sent for a request that reported errors.
//
// It embeds util_xds.NoopCallbacks and overrides only OnStreamResponse; the
// remaining callback methods come from the embedded type.
type nackBackoff struct {
	util_xds.NoopCallbacks

	// backoff is how long OnStreamResponse sleeps when the request has errors.
	backoff time.Duration
}

// Compile-time check that *nackBackoff satisfies util_xds.Callbacks.
var _ util_xds.Callbacks = (*nackBackoff)(nil)

// NewNackBackoff returns callbacks that sleep for the given backoff duration
// in OnStreamResponse whenever the request being answered reports errors.
func NewNackBackoff(backoff time.Duration) util_xds.Callbacks {
	callbacks := new(nackBackoff)
	callbacks.backoff = backoff
	return callbacks
}

// OnStreamResponse delays the caller by the configured backoff when the
// request being answered carries errors; otherwise it returns immediately.
//
// A request with errors means that the other control plane rejected the
// configuration this control plane generated. Typical causes:
//  1. Eventual consistency - e.g. a MeshTrafficPermission arrives before the
//     Mesh it belongs to has been synced.
//  2. The config is valid on one side but invalid on the other - e.g. the
//     schema is broken.
//
// The second cause is the dangerous one: this control plane keeps resending
// the config and the other side keeps rejecting it right away, forming a
// tight loop that puts this control plane under heavy load. The sleep below
// breaks that loop.
func (n *nackBackoff) OnStreamResponse(_ int64, request util_xds.DiscoveryRequest, _ util_xds.DiscoveryResponse) {
	if !request.HasErrors() {
		// Nothing was rejected, so there is no reason to slow down.
		return
	}

	nackLog.Info(
		"config was previously rejected by other control plane. Applying backoff before resending it",
		"backoff", n.backoff,
		"nodeID", request.NodeId(),
		"reason", request.ErrorMsg(),
	)

	// Blocking is considered safe here because OnStreamResponse is executed in
	// the goroutine of a single ADS stream.
	time.Sleep(n.backoff)
}
14 changes: 11 additions & 3 deletions pkg/kds/zone/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,17 @@ func Setup(rt core_runtime.Runtime) error {
zone := rt.Config().Multizone.Zone.Name
reg := registry.Global()
kdsCtx := rt.KDSContext()
kdsServer, err := kds_server.New(kdsZoneLog, rt, reg.ObjectTypes(model.HasKDSFlag(model.ProvidedByZone)),
zone, rt.Config().Multizone.Zone.KDS.RefreshInterval.Duration,
kdsCtx.ZoneProvidedFilter, kdsCtx.ZoneResourceMapper, false)
kdsServer, err := kds_server.New(
kdsZoneLog,
rt,
reg.ObjectTypes(model.HasKDSFlag(model.ProvidedByZone)),
zone,
rt.Config().Multizone.Zone.KDS.RefreshInterval.Duration,
kdsCtx.ZoneProvidedFilter,
kdsCtx.ZoneResourceMapper,
false,
rt.Config().Multizone.Zone.KDS.NackBackoff.Duration,
)
if err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/test/kds/setup/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,5 @@ func StartServer(store store.ResourceStore, clusterID string, providedTypes []mo
cfg: kuma_cp.Config{},
metrics: metrics,
}
return kds_server.New(core.Log.WithName("kds").WithName(clusterID), rt, providedTypes, clusterID, 100*time.Millisecond, providedFilter, providedMapper, false)
return kds_server.New(core.Log.WithName("kds").WithName(clusterID), rt, providedTypes, clusterID, 100*time.Millisecond, providedFilter, providedMapper, false, 1*time.Second)
}