Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SHIPA-2322] adds wait-retry to helm update & delete #233

Merged
merged 4 commits into from
Feb 11, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 99 additions & 5 deletions internal/chart/helm_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,58 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

type statusFunc func(cfg *action.Configuration, appName string) (*release.Release, release.Status, error)

const (
WaitRetry = iota
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like there is no need to export WaitRetry, TakeAction and NoAction.
Is it possible to use waitRetry, takeAction and noAction correspondingly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You bet. Privatized!

TakeAction
NoAction
)

// helmStatusActionMapUpdate maps a Release Status to a Ketch action for helm updates
var helmStatusActionMapUpdate = map[release.Status]int{
"not-found": TakeAction,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe a new const for not-found?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.

release.StatusUnknown: WaitRetry,
release.StatusDeployed: TakeAction,
release.StatusUninstalled: TakeAction,
release.StatusSuperseded: NoAction,
release.StatusFailed: TakeAction,
release.StatusUninstalling: WaitRetry,
release.StatusPendingInstall: WaitRetry,
release.StatusPendingUpgrade: WaitRetry,
release.StatusPendingRollback: WaitRetry,
}

// helmStatusActionMapDelete maps a Release Status to a Ketch action for helm deletions
var helmStatusActionMapDelete = map[release.Status]int{
"not-found": NoAction,
release.StatusUnknown: WaitRetry,
release.StatusDeployed: TakeAction,
release.StatusUninstalled: NoAction,
release.StatusSuperseded: NoAction,
release.StatusFailed: NoAction,
release.StatusUninstalling: NoAction,
release.StatusPendingInstall: WaitRetry,
release.StatusPendingUpgrade: WaitRetry,
release.StatusPendingRollback: WaitRetry,
}

var (
statusRetryInterval = time.Second * 1
statusRetryTimeout = time.Second * 5
)

const (
defaultDeploymentTimeout = 10 * time.Minute
)

// HelmClient performs helm install and uninstall operations for provided application helm charts.
type HelmClient struct {
cfg *action.Configuration
namespace string
c client.Client
log logr.Logger
cfg *action.Configuration
namespace string
c client.Client
log logr.Logger
statusFunc statusFunc
}

// TemplateValuer is an interface that permits types that implement it (e.g. Application, Job)
Expand Down Expand Up @@ -105,15 +147,67 @@ func (c HelmClient) UpdateChart(tv TemplateValuer, config ChartConfig, opts ...I
namespace: c.namespace,
cli: c.c,
}
shouldUpdate, err := c.waitForActionableStatus(c.statusFunc, appName, helmStatusActionMapUpdate)
if err != nil || !shouldUpdate {
return nil, err
}
return updateClient.Run(appName, chrt, vals)
}

// DeleteChart uninstalls the app's helm release. It doesn't return an error if the release is not found.
func (c HelmClient) DeleteChart(appName string) error {
shouldDelete, err := c.waitForActionableStatus(c.statusFunc, appName, helmStatusActionMapDelete)
if err != nil || !shouldDelete {
return err
}
uninstall := action.NewUninstall(c.cfg)
_, err := uninstall.Run(appName)
_, err = uninstall.Run(appName)
if err != nil && errors.Is(err, driver.ErrReleaseNotFound) {
return nil
}
return err
}

// getHelmStatus returns the Release, Status, and error for an app
func getHelmStatus(cfg *action.Configuration, appName string) (*release.Release, release.Status, error) {
statusClient := action.NewStatus(cfg)
status, err := statusClient.Run(appName)
if err != nil {
if errors.Is(err, driver.ErrReleaseNotFound) || status.Info == nil {
return nil, "not-found", nil
}
return nil, "", err
}
return status, status.Info.Status, nil
}

// waitForActionableStatus returns true if the helm status is such that it is ok to proceed with update/delete. If the statusFunc returns
// WaitRetry, the func blocks while retrying.
func (c HelmClient) waitForActionableStatus(statusFunc statusFunc, appName string, statusActionMap map[release.Status]int) (bool, error) {
ticker := time.NewTicker(statusRetryInterval)
done := time.After(statusRetryTimeout)
var helmRelease *release.Release
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this approach, but will it block the appReconciler's loop?
if yes, what are the negative consequences we are going to deal with?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per our slack discussion: I removed the wait loop. Now, "wait-retry" statuses throw an error, which the reconciler loop is expected to handle. This means that 1) we don't manually update a chart's status here (not sure if that's good or bad) 2) there is a possiblity that a chart will get stuck in a weird status like pending-uninstall and the reconciler will just keep re-trying. Not sure if that's something to be concerned about.

var status release.Status
var err error
for {
select {
case <-done:
c.log.Info(fmt.Sprintf("Setting status (%s) of %s release that has timeouted to: deployed", appName, status))
helmRelease.SetStatus(release.StatusDeployed, "set manually after timeout waiting for actionable status")
return true, nil
case <-ticker.C:
helmRelease, status, err = statusFunc(c.cfg, appName)
if err != nil {
return false, err
}
action := statusActionMap[status] // default wait-retry
if action == NoAction {
c.log.Info(fmt.Sprintf("helm chart for app %s release already in state %s - no action required", appName, status))
return false, nil
}
if action == TakeAction {
return true, nil
}
}
}
}
2 changes: 1 addition & 1 deletion internal/chart/helm_client_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (f *HelmClientFactory) NewHelmClient(namespace string, c client.Client, log
f.configurations[namespace] = cfg
}
f.configurationsLastUsedTimes[namespace] = time.Now()
return &HelmClient{cfg: cfg, namespace: namespace, c: c, log: log.WithValues("helm-client", namespace)}, nil
return &HelmClient{cfg: cfg, namespace: namespace, c: c, log: log.WithValues("helm-client", namespace), statusFunc: getHelmStatus}, nil
}

func (f *HelmClientFactory) cleanup() {
Expand Down
117 changes: 117 additions & 0 deletions internal/chart/helm_client_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package chart

import (
"testing"
"time"

"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/stretchr/testify/require"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
"helm.sh/helm/v3/pkg/release"
)

func TestWaitForActionableStatus(t *testing.T) {
tests := []struct {
description string
statusFuncMap map[int]release.Status // sequence of responses returned by mockStatusFunc
statusMap map[release.Status]int // test update or delete
expected bool
expectedStatusFuncCalls int
}{
{
description: "delete - deployed",
statusFuncMap: map[int]release.Status{0: release.StatusDeployed},
statusMap: helmStatusActionMapDelete,
expected: true,
expectedStatusFuncCalls: 1,
},
{
description: "delete - eventual success",
statusFuncMap: map[int]release.Status{0: release.StatusUnknown, 1: release.StatusDeployed},
statusMap: helmStatusActionMapDelete,
expected: true,
expectedStatusFuncCalls: 2,
},
{
description: "delete - not found",
statusFuncMap: map[int]release.Status{0: "not-found"},
statusMap: helmStatusActionMapDelete,
expected: false,
expectedStatusFuncCalls: 1,
},
{
description: "delete - superseded",
statusFuncMap: map[int]release.Status{0: release.StatusSuperseded},
statusMap: helmStatusActionMapDelete,
expected: false,
expectedStatusFuncCalls: 1,
},
{
description: "delete timeout",
statusFuncMap: map[int]release.Status{0: release.StatusPendingInstall, 1: release.StatusPendingInstall, 2: release.StatusPendingInstall, 3: release.StatusPendingInstall, 4: release.StatusPendingInstall},
statusMap: helmStatusActionMapDelete,
expected: true,
expectedStatusFuncCalls: 5,
},
{
description: "update - deployed",
statusFuncMap: map[int]release.Status{0: release.StatusDeployed},
statusMap: helmStatusActionMapUpdate,
expected: true,
expectedStatusFuncCalls: 1,
},
{
description: "update - eventual success",
statusFuncMap: map[int]release.Status{0: release.StatusUnknown, 1: release.StatusDeployed},
statusMap: helmStatusActionMapUpdate,
expected: true,
expectedStatusFuncCalls: 2,
},
{
description: "update - not found",
statusFuncMap: map[int]release.Status{0: "not-found"},
statusMap: helmStatusActionMapUpdate,
expected: true,
expectedStatusFuncCalls: 1,
},
{
description: "update - superseded",
statusFuncMap: map[int]release.Status{0: release.StatusSuperseded},
statusMap: helmStatusActionMapUpdate,
expected: false,
expectedStatusFuncCalls: 1,
},
{
description: "update timeout",
statusFuncMap: map[int]release.Status{0: release.StatusPendingInstall, 1: release.StatusPendingInstall, 2: release.StatusPendingInstall, 3: release.StatusPendingInstall, 4: release.StatusPendingInstall},
statusMap: helmStatusActionMapUpdate,
expected: true,
expectedStatusFuncCalls: 5,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
c := &HelmClient{
log: log.NullLogger{},
}
// speed up retry, timeout
statusRetryInterval = time.Millisecond * 100
statusRetryTimeout = time.Millisecond * 500
// mockStatusFunc and counter to track times called
counter := 0
mockStatusFunc := func(cfg *action.Configuration, appName string) (*release.Release, release.Status, error) {
status := tc.statusFuncMap[counter]
counter += 1
mockRelease := &release.Release{Chart: &chart.Chart{}, Info: &release.Info{}}
return mockRelease, status, nil
}

ok, err := c.waitForActionableStatus(mockStatusFunc, "testapp", tc.statusMap)
require.Nil(t, err)
require.Equal(t, tc.expected, ok)
require.Equal(t, tc.expectedStatusFuncCalls, counter)
})
}
}