Skip to content

Commit

Permalink
Zero downtime upgrade test (#10436)
Browse files Browse the repository at this point in the history
Co-authored-by: changelog-bot <changelog-bot>
  • Loading branch information
jenshu authored Dec 11, 2024
1 parent 6d162f9 commit 6a950f5
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 202 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr-kubernetes-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ jobs:
# Dec 4, 2024: 26 minutes
- cluster-name: 'cluster-six'
go-test-args: '-v -timeout=30m'
go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$|^TestK8sGateway$$/^HTTPRouteServices$$|^TestK8sGateway$$/^TCPRouteServices$$|^TestZeroDowntimeRollout$$'
go-test-run-regex: '^TestDiscoveryWatchlabels$$|^TestK8sGatewayNoValidation$$|^TestHelm$$|^TestHelmSettings$$|^TestK8sGatewayAws$$|^TestK8sGateway$$/^HTTPRouteServices$$|^TestK8sGateway$$/^TCPRouteServices$$'

# Dec 4, 2024: 13 minutes
- cluster-name: 'cluster-seven'
Expand Down
6 changes: 6 additions & 0 deletions changelog/v1.19.0-beta2/zd-upgrade-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
changelog:
- type: NON_USER_FACING
description: >-
Add a kube gateway zero-downtime upgrade test.
skipCI-docs-build:true
143 changes: 140 additions & 3 deletions test/kubernetes/e2e/features/upgrade/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@ package upgrade

import (
"context"
"fmt"
"net/http"
"path/filepath"
"strconv"
"time"

"github.com/stretchr/testify/suite"

. "github.com/onsi/gomega"
"github.com/solo-io/gloo/pkg/utils/kubeutils"
"github.com/solo-io/gloo/pkg/utils/requestutils/curl"
testmatchers "github.com/solo-io/gloo/test/gomega/matchers"
"github.com/solo-io/gloo/test/kubernetes/e2e"
"github.com/solo-io/gloo/test/kubernetes/e2e/defaults"
"github.com/solo-io/gloo/test/kubernetes/e2e/tests/base"
"github.com/solo-io/gloo/test/kubernetes/testutils/helper"
"github.com/solo-io/skv2/codegen/util"
"github.com/stretchr/testify/suite"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)

var _ e2e.NewSuiteFunc = NewTestingSuite
Expand All @@ -29,7 +38,7 @@ func NewTestingSuite(ctx context.Context, testInst *e2e.TestInstallation) suite.
testInst.Metadata.ReleasedVersion = releaseVersion

return &testingSuite{
base.NewBaseTestingSuite(ctx, testInst, testHelper, base.SimpleTestCase{}, nil),
base.NewBaseTestingSuite(ctx, testInst, testHelper, base.SimpleTestCase{}, testCases),
}
}

Expand All @@ -42,15 +51,22 @@ func (s *testingSuite) TearDownSuite() {
}

// BeforeTest installs the old (released) version of Gloo Gateway and then lets the base suite
// apply whatever manifests are registered for the named test.
func (s *testingSuite) BeforeTest(suiteName, testName string) {
	// Every test starts from an installation of the old release, so that the test body can
	// exercise the upgrade path.
	crdDir := filepath.Join(s.TestHelper.RootDir, "install", "helm", "gloo", "crds")
	valuesArgs := helper.WithExtraArgs(
		"--values", s.TestInstallation.Metadata.ProfileValuesManifestFile,
		"--values", s.TestInstallation.Metadata.ValuesManifestFile,
	)

	installErr := s.TestHelper.InstallGloo(s.Ctx, 10*time.Minute, valuesArgs, helper.WithCRDs(crdDir))
	s.TestInstallation.Assertions.Require.NoError(installErr)

	// Apply the per-test manifests, if any.
	s.BaseTestingSuite.BeforeTest(suiteName, testName)
}

func (s *testingSuite) AfterTest(suiteName, testName string) {
// delete manifests, if any
s.BaseTestingSuite.AfterTest(suiteName, testName)

s.TestInstallation.UninstallGlooGateway(s.Ctx, func(ctx context.Context) error {
return s.TestHelper.UninstallGlooAll()
})
Expand Down Expand Up @@ -107,10 +123,131 @@ func (s *testingSuite) TestValidationWebhookCABundle() {
ensureWebhookCABundleMatchesSecretsRootCAValue()
}

// TestZeroDowntimeUpgrade verifies that the kube gateway proxy keeps serving traffic while Gloo Gateway
// is upgraded and while the proxy deployment is subsequently restarted.
func (s *testingSuite) TestZeroDowntimeUpgrade() {
	s.waitProxyRunning()

	// Repeatedly send requests to the proxy while performing an upgrade, and make sure all requests
	// succeed. ensureZeroDowntimeDuringAction requires the load to still be running after the action
	// returns, so the request count has to be large enough to outlast the upgrade plus the restart.
	s.ensureZeroDowntimeDuringAction(func() {
		// Use the suite-scoped Gomega instance for every assertion in this action, rather than mixing
		// it with the package-level Expect.
		g := s.TestInstallation.Assertions.Gomega

		// do the upgrade
		valuesFile := filepath.Join(util.MustGetThisDir(), "testdata", "manifests", "zero-downtime-upgrade.yaml")
		s.UpgradeWithCustomValuesFile(valuesFile)

		// as a sanity check make sure the deployer re-deployed resources with the new values
		svc := &corev1.Service{}
		err := s.TestInstallation.ClusterContext.Client.Get(s.Ctx,
			types.NamespacedName{Name: glooProxyObjectMeta.Name, Namespace: glooProxyObjectMeta.Namespace},
			svc)
		s.Require().NoError(err)
		g.Expect(svc.GetLabels()).To(
			HaveKeyWithValue("new-service-label-key", "new-service-label-val"))

		// now restart the deployment and make sure there's still no downtime
		err = s.TestHelper.RestartDeploymentAndWait(s.Ctx, "gloo-proxy-gw")
		g.Expect(err).ToNot(HaveOccurred())
	}, 3000)
}

// UpgradeWithCustomValuesFile upgrades the Gloo installation, passing valuesFile to the upgrade as
// its only `--values` argument, and fails the test if the upgrade returns an error.
func (s *testingSuite) UpgradeWithCustomValuesFile(valuesFile string) {
	// Only the supplied values file is passed: the existing values must not be reused, because the
	// new chart has to be installed with the new version of the images.
	upgradeArgs := helper.WithExtraArgs("--values", valuesFile)

	_, upgradeErr := s.TestHelper.UpgradeGloo(s.Ctx, 10*time.Minute, upgradeArgs)
	s.TestInstallation.Assertions.Require.NoError(upgradeErr)
}

// waitProxyRunning blocks until the proxy reports one running replica and a curl request sent to the
// proxy service with the `example.com` host header is eventually answered with 200 OK.
func (s *testingSuite) waitProxyRunning() {
	assertions := s.TestInstallation.Assertions

	// First wait for the proxy pod itself to be running.
	assertions.EventuallyRunningReplicas(s.Ctx, glooProxyObjectMeta, Equal(1))

	// Then wait until it is actually able to receive traffic.
	curlOpts := []curl.Option{
		curl.WithHost(kubeutils.ServiceFQDN(proxyService.ObjectMeta)),
		curl.WithHostHeader("example.com"),
	}
	wantOK := &testmatchers.HttpResponse{
		StatusCode: http.StatusOK,
	}
	assertions.AssertEventualCurlResponse(s.Ctx, defaults.CurlPodExecOpt, curlOpts, wantOK)
}

// ensureZeroDowntimeDuringAction continuously sends traffic to the proxy while performing the action
// specified by `actionFunc`, and ensures there is no downtime.
//
// `numRequests` specifies the total number of requests to send. With the flags used below, `hey` runs
// 4 workers (-c 4) rate-limited to 10 req/sec each (-q 10), i.e. roughly 40 req/sec overall, so the
// load lasts about numRequests/40 seconds. The load must outlast `actionFunc` by at least one second,
// otherwise the test fails (see the elapsed-time check below).
func (s *testingSuite) ensureZeroDowntimeDuringAction(actionFunc func(), numRequests int) {
	// Use the suite-scoped Gomega instance (as the rest of this suite does) instead of the
	// package-level Expect.
	g := s.TestInstallation.Assertions.Gomega

	// Send traffic to the gloo gateway pod while performing the specified action.
	// Run this for long enough to perform the action since there's no easy way
	// to stop this command once the test is over
	// e.g. for numRequests=800, this executes 800 req @ ~40 req/sec = 20s (3 * terminationGracePeriodSeconds (5) + buffer)
	// kubectl exec -n hey hey -- hey -disable-keepalive -c 4 -q 10 --cpus 1 -n 800 -m GET -t 1 -host example.com http://gloo-proxy-gw.default.svc.cluster.local:8080
	args := []string{
		"exec", "-n", "hey", "hey", "--",
		"hey", "-disable-keepalive", "-c", "4", "-q", "10", "--cpus", "1",
		"-n", strconv.Itoa(numRequests),
		"-m", "GET", "-t", "1", "-host", "example.com",
		"http://gloo-proxy-gw.default.svc.cluster.local:8080",
	}

	var err error
	cmd := s.TestHelper.Cli.Command(s.Ctx, args...)
	err = cmd.Start()
	// The Gomega matcher is kept for the error checks on purpose: it also treats a nil pointer
	// wrapped in the error interface as "no error".
	g.Expect(err).ToNot(HaveOccurred())

	// Perform the specified action. There should be no downtime since the gloo gateway pod should have the readiness probes configured
	actionFunc()

	// Record when the action finished so we can tell how long the load kept running afterwards.
	actionDone := time.Now()
	err = cmd.Wait()
	g.Expect(err).ToNot(HaveOccurred())

	// Since there's no easy way to stop the command after we've performed the action, we instead
	// ensure that traffic kept flowing for at least 1 second after the action completed. If the
	// load generator had already finished before then, part of the action ran without any traffic
	// and the result would not prove anything.
	secondsAfterAction := int(time.Since(actionDone).Seconds())
	s.GreaterOrEqual(secondsAfterAction, 1)

	// Sample output of a successful run with 1200 requests:
	//
	// Summary:
	//   Total:        30.0113 secs
	//   Slowest:      0.0985 secs
	//   Fastest:      0.0025 secs
	//   Average:      0.0069 secs
	//   Requests/sec: 39.9849
	//
	//   Total data:   738000 bytes
	//   Size/request: 615 bytes
	//
	// Response time histogram:
	//   0.003 [1]    |
	//   0.012 [1165] |■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■
	//   0.022 [24]   |■
	//   0.031 [4]    |
	//   0.041 [0]    |
	//   0.050 [0]    |
	//   0.060 [0]    |
	//   0.070 [0]    |
	//   0.079 [0]    |
	//   0.089 [1]    |
	//   0.098 [5]    |
	//
	// Latency distribution:
	//   10% in 0.0036 secs
	//   25% in 0.0044 secs
	//   50% in 0.0060 secs
	//   75% in 0.0082 secs
	//   90% in 0.0099 secs
	//   95% in 0.0109 secs
	//   99% in 0.0187 secs
	//
	// Details (average, fastest, slowest):
	//   DNS+dialup: 0.0028 secs, 0.0025 secs, 0.0985 secs
	//   DNS-lookup: 0.0016 secs, 0.0001 secs, 0.0116 secs
	//   req write:  0.0003 secs, 0.0001 secs, 0.0041 secs
	//   resp wait:  0.0034 secs, 0.0012 secs, 0.0782 secs
	//   resp read:  0.0003 secs, 0.0001 secs, 0.0039 secs
	//
	// Status code distribution:
	//   [200] 1200 responses
	//
	// ***** Should not contain something like this (taken from an 800-request run) *****
	// Status code distribution:
	//   [200] 779 responses
	// Error distribution:
	//   [17] Get http://gloo-proxy-gw.default.svc.cluster.local:8080: dial tcp 10.96.177.91:8080: connection refused
	//   [4] Get http://gloo-proxy-gw.default.svc.cluster.local:8080: net/http: request canceled while waiting for connection

	// Verify that there were no errors: every request must have returned 200 and hey must not have
	// reported any connection-level failures.
	output := cmd.Output()
	g.Expect(output).To(ContainSubstring(fmt.Sprintf("[200] %d responses", numRequests)))
	g.Expect(output).ToNot(ContainSubstring("Error distribution"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
kubeGateway:
enabled: true
# Enable the probes to ensure zero downtime
gatewayParameters:
glooGateway:
service:
extraLabels:
new-service-label-key: new-service-label-val
podTemplate:
terminationGracePeriodSeconds: 5
gracefulShutdown:
enabled: true
sleepTimeSeconds: 2
probes: true
customLivenessProbe:
exec:
command:
- wget
- -O
- /dev/null
- 127.0.0.1:19000/server_info
initialDelaySeconds: 3
periodSeconds: 10
failureThreshold: 3
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
package zero_downtime_rollout
package upgrade

import (
"path/filepath"

"github.com/solo-io/gloo/test/kubernetes/e2e/defaults"
"github.com/solo-io/gloo/test/kubernetes/e2e/tests/base"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/solo-io/skv2/codegen/util"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var (
Expand All @@ -32,8 +30,8 @@ var (
},
}

zeroDowntimeTestCases = map[string]*base.TestCase{
"TestZeroDowntimeRollout": {
testCases = map[string]*base.TestCase{
"TestZeroDowntimeUpgrade": {
SimpleTestCase: base.SimpleTestCase{
Manifests: []string{defaults.CurlPodManifest, serviceManifest, routeWithServiceManifest},
Resources: []client.Object{proxyDeployment, proxyService, defaults.CurlPod, heyPod},
Expand Down
126 changes: 0 additions & 126 deletions test/kubernetes/e2e/features/zero_downtime_rollout/suite.go

This file was deleted.

Loading

0 comments on commit 6a950f5

Please sign in to comment.