From b456aa82ff8017ace2f8937002c20c338092097d Mon Sep 17 00:00:00 2001 From: Johannes Malsam <60240743+johannes94@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:07:18 +0100 Subject: [PATCH] ROX-26421: add backoff logic if addon is stuck in upgrading (#2090) * add backoff logic if addon is stuck in upgrading * PR feedback --- internal/dinosaur/pkg/services/addon.go | 19 ++++++ internal/dinosaur/pkg/services/addon_test.go | 70 ++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/internal/dinosaur/pkg/services/addon.go b/internal/dinosaur/pkg/services/addon.go index 3b2b47fb5..846e8f911 100644 --- a/internal/dinosaur/pkg/services/addon.go +++ b/internal/dinosaur/pkg/services/addon.go @@ -4,6 +4,7 @@ import ( "encoding/json" "errors" "fmt" + "time" "github.com/golang/glog" clustersmgmtv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" @@ -20,6 +21,12 @@ import ( const fleetshardImageTagParameter = "fleetshardSyncImageTag" +// This backoff was introduced to prevent reaching OCM service log limits +// by sending too many addon upgrade requests. The limit is 1000. A backoff +// of 20 minutes would result in 504 requests per week, which leaves some space +// for other openshift components to call the API as well. +const addonUpgradeBackoff = 20 * time.Minute + type updateAddonStatusMetricFunc func(addonID, clusterName string, status metrics.AddonStatus) // AddonProvisioner keeps addon installations on the data plane clusters up-to-date @@ -27,6 +34,8 @@ type AddonProvisioner struct { ocmClient ocm.Client customizations []addonCustomization updateAddonStatusMetricFunc updateAddonStatusMetricFunc + lastStatus metrics.AddonStatus + lastUpgradeRequestTime time.Time } // NewAddonProvisioner creates a new instance of AddonProvisioner @@ -224,10 +233,16 @@ func (p *AddonProvisioner) newInstallation(config gitops.AddonConfig) (*clusters } func (p *AddonProvisioner) updateAddon(clusterID string, config gitops.AddonConfig) error { + if p.backoffUpgradeRequest() { + glog.V(5).Infof("update addon request backoff for cluster: %s", clusterID) + return nil + } + update, err := p.newInstallation(config) if err != nil { return err } + p.lastUpgradeRequestTime = time.Now() if err := p.ocmClient.UpdateAddonInstallation(clusterID, update); err != nil { return fmt.Errorf("update addon %s: %w", update.ID(), err) } @@ -235,6 +250,10 @@ func (p *AddonProvisioner) updateAddon(clusterID string, config gitops.AddonConf return nil } +func (p *AddonProvisioner) backoffUpgradeRequest() bool { + return p.lastStatus == metrics.AddonUpgrade && time.Since(p.lastUpgradeRequestTime) < addonUpgradeBackoff +} + func (p *AddonProvisioner) uninstallAddon(clusterID string, addonID string) error { if err := p.ocmClient.DeleteAddonInstallation(clusterID, addonID); err != nil { return fmt.Errorf("uninstall addon %s: %w", addonID, err) diff --git a/internal/dinosaur/pkg/services/addon_test.go b/internal/dinosaur/pkg/services/addon_test.go index 15503c130..a3782829b 100644 --- a/internal/dinosaur/pkg/services/addon_test.go +++ b/internal/dinosaur/pkg/services/addon_test.go @@ -3,6 +3,7 @@ package services import ( "encoding/json" "testing" + "time" . "github.com/onsi/gomega" clustersmgmtv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" @@ -661,6 +662,13 @@ func TestAddonProvisioner_Provision(t *testing.T) { if tt.want != nil { tt.want(tt.fields.ocmClient) } + + if tt.fields.ocmClient != nil { + if len(tt.fields.ocmClient.UpdateAddonInstallationCalls()) > 0 { + Expect(p.lastUpgradeRequestTime).NotTo(Equal(time.Time{})) + } + } + Expect(updates).To(Equal(tt.wantStatuses)) }) } @@ -894,3 +902,65 @@ func TestAddonProvisioner_NewAddonProvisioner(t *testing.T) { Expect(baseConfigPtr.ClientSecret).To(Equal("base-client-secret")) Expect(baseConfigPtr.SelfToken).To(Equal("base-token")) } + +func TestAddonProvisioner_Provision_UpgradeBackoff(t *testing.T) { + RegisterTestingT(t) + + ocmMock := &ocm.ClientMock{ + GetAddonInstallationFunc: func(clusterID string, addonID string) (*clustersmgmtv1.AddOnInstallation, *errors.ServiceError) { + object, err := clustersmgmtv1.NewAddOnInstallation(). + ID(addonID). + Addon(clustersmgmtv1.NewAddOn().ID(addonID)). + AddonVersion(clustersmgmtv1.NewAddOnVersion().ID("0.2.0")). + State(clustersmgmtv1.AddOnInstallationStateReady). + Build() + Expect(err).To(Not(HaveOccurred())) + return object, nil + }, + GetAddonVersionFunc: func(addonID string, version string) (*clustersmgmtv1.AddOnVersion, error) { + return clustersmgmtv1.NewAddOnVersion(). + ID("0.2.0"). + SourceImage("quay.io/osd-addons/acs-fleetshard-index@sha256:71eaaccb4d3962043eac953fb3c19a6cc6a88b18c472dd264efc5eb3da4960ac"). + PackageImage("quay.io/osd-addons/acs-fleetshard-package@sha256:3e4fc039662b876c83dd4b48a9608d6867a12ab4932c5b7297bfbe50ba8ee61c"). + Build() + }, + UpdateAddonInstallationFunc: func(clusterID string, addon *clustersmgmtv1.AddOnInstallation) error { + return nil + }, + } + addonConfig := ocmImpl.AddonConfig{ + FleetshardSyncImageTag: "0307e03", + InheritFleetshardSyncImageTag: true, + } + p := &AddonProvisioner{ + ocmClient: ocmMock, + customizations: initCustomizations(addonConfig), + lastStatus: metrics.AddonUpgrade, + lastUpgradeRequestTime: time.Now(), + } + err := p.Provision(api.Cluster{ + Addons: addonsJSON([]dbapi.AddonInstallation{ + { + ID: "acs-fleetshard", + Version: "0.2.0", + SourceImage: "quay.io/osd-addons/acs-fleetshard-index@sha256:71eaaccb4d3962043eac953fb3c19a6cc6a88b18c472dd264efc5eb3da4960ac", + PackageImage: "quay.io/osd-addons/acs-fleetshard-package@sha256:3e4fc039662b876c83dd4b48a9608d6867a12ab4932c5b7297bfbe50ba8ee61c", + ParametersSHA256Sum: "3e4fc039662b876c83dd4b48a9608d6867a12ab4932c5b7297bfbe50ba8ee61c", // pragma: allowlist secret + }, + }), + }, + gitops.DataPlaneClusterConfig{ + Addons: []gitops.AddonConfig{ + { + ID: "acs-fleetshard", + Version: "0.3.0", + Parameters: map[string]string{ + "fleetshardSyncImageTag": "inherit", + }, + }, + }, + }) + + Expect(err).To(Not(HaveOccurred())) + Expect(ocmMock.UpdateAddonInstallationCalls()).To(BeEmpty()) +}