From 22313b8007073574df590aa3a791dfea8c5b7b1b Mon Sep 17 00:00:00 2001
From: Jarek Kowalski
Date: Thu, 7 Feb 2019 09:42:17 -0800
Subject: [PATCH] Added a very simple stress test which scales fleets up/down
 repeatedly, and a basic stress test harness.

The same test is used during regular e2e tests, except there it runs just a
few iterations on smaller fleets.

To run the stress test, invoke `make stress-test-e2e`, optionally passing
`STRESS_TEST_LEVEL`, which controls the fleet sizes used (1..100, defaults
to 20). Depending on the stress test level, you may need a cluster with a
lot of capacity.

By convention, `make stress-test-e2e` runs all test cases whose names
include 'StressTest' and ignores everything else.
---
 build/Makefile                  |  18 +++++
 test/e2e/fleet_test.go          | 125 +++++++++++++++++++++++++++++---
 test/e2e/framework/framework.go |   6 +-
 test/e2e/framework/perf.go      |  65 +++++++++++++++++
 test/e2e/main_test.go           |   3 +-
 5 files changed, 204 insertions(+), 13 deletions(-)
 create mode 100644 test/e2e/framework/perf.go

diff --git a/build/Makefile b/build/Makefile
index 75e93aab49..269e3c1326 100644
--- a/build/Makefile
+++ b/build/Makefile
@@ -48,6 +48,10 @@ GCP_BUCKET_CHARTS ?= agones-chart
 MINIKUBE_PROFILE ?= agones
 GO_BUILD_TAGS ?= none
 
+# Specify stress test level 1..100
+# STRESS_TEST_LEVEL=n requires capacity for between 50*n and 100*n simple-udp Game Servers.
+STRESS_TEST_LEVEL=20
+
 # kind cluster name to use
 KIND_PROFILE ?= agones
 KIND_CONTAINER_NAME=kind-$(KIND_PROFILE)-control-plane
@@ -217,6 +221,16 @@ test-e2e: $(ensure-build-image)
 		--gameserver-image=$(GS_TEST_IMAGE) \
 		--pullsecret=$(IMAGE_PULL_SECRET)
 
+# Runs end-to-end stress tests on the currently configured cluster
+# For minikube, use the minikube-stress-test-e2e target
+stress-test-e2e: $(ensure-build-image)
+	$(GO_TEST) $(agones_package)/test/e2e $(ARGS) $(GO_E2E_TEST_ARGS) \
+		-timeout 1h \
+		-run '.*StressTest.*' \
+		--gameserver-image=$(GS_TEST_IMAGE) \
+		--pullsecret=$(IMAGE_PULL_SECRET) \
+		--stress $(STRESS_TEST_LEVEL)
+
 # Run test on install yaml - make sure there is no change
 # mostly this is for CI
 test-install-yaml:
@@ -610,6 +624,10 @@ minikube-transfer-image:
 minikube-test-e2e: DOCKER_RUN_ARGS=--network=host -v $(minikube_cert_mount)
 minikube-test-e2e: minikube-agones-profile test-e2e
 
+# Runs stress tests against our minikube
+minikube-stress-test-e2e: DOCKER_RUN_ARGS=--network=host -v $(minikube_cert_mount)
+minikube-stress-test-e2e: minikube-agones-profile stress-test-e2e
+
 # prometheus on minkube
 # we have to disable PVC as it's not supported on minkube.
 minikube-setup-prometheus:
diff --git a/test/e2e/fleet_test.go b/test/e2e/fleet_test.go
index 6a2f9a9f71..f7445f5b14 100644
--- a/test/e2e/fleet_test.go
+++ b/test/e2e/fleet_test.go
@@ -31,6 +31,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/util/retry"
 )
 
 const (
@@ -640,6 +641,103 @@ func TestCreateFleetAndUpdateScaleSubresource(t *testing.T) {
 	framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(initialReplicas))
 }
 
+// TestScaleUpAndDownInParallelStressTest creates N fleets, half of which start at the full
+// fleet size and the other half at zero replicas, and scales them up/down repeatedly in
+// parallel, expecting each fleet to reach the desired number of ready replicas every time.
+// This test is also used as a stress test with 'make stress-test-e2e', in which case it creates
+// many more fleets of bigger sizes and runs many more repetitions.
+func TestScaleUpAndDownInParallelStressTest(t *testing.T) {
+	t.Parallel()
+
+	alpha1 := framework.AgonesClient.StableV1alpha1()
+	fleetCount := 2
+	fleetSize := int32(10)
+	repeatCount := 3
+	deadline := time.Now().Add(1 * time.Minute)
+
+	if framework.StressTestLevel > 0 {
+		fleetSize = 10 * int32(framework.StressTestLevel)
+		repeatCount = 10
+		fleetCount = 10
+		deadline = time.Now().Add(45 * time.Minute)
+	}
+
+	logrus.WithField("fleetCount", fleetCount).
+		WithField("fleetSize", fleetSize).
+		WithField("repeatCount", repeatCount).
+		WithField("deadline", deadline).
+		Info("starting scale up/down test")
+
+	var fleets []*v1alpha1.Fleet
+
+	var scaleUpResults e2e.PerfResults
+	var scaleDownResults e2e.PerfResults
+
+	for fleetNumber := 0; fleetNumber < fleetCount; fleetNumber++ {
+		flt := defaultFleet()
+		flt.ObjectMeta.GenerateName = fmt.Sprintf("scale-fleet-%v-", fleetNumber)
+		if fleetNumber%2 == 0 {
+			// even-numbered fleets start at fleetSize and are scaled down to zero and back.
+			flt.Spec.Replicas = fleetSize
+		} else {
+			// odd-numbered fleets start at zero and are scaled up to fleetSize and back.
+			flt.Spec.Replicas = 0
+		}
+
+		flt, err := alpha1.Fleets(defaultNs).Create(flt)
+		if assert.Nil(t, err) {
+			defer alpha1.Fleets(defaultNs).Delete(flt.ObjectMeta.Name, nil) // nolint:errcheck
+		}
+		fleets = append(fleets, flt)
+	}
+
+	// wait for initial fleet conditions.
+	for fleetNumber, flt := range fleets {
+		if fleetNumber%2 == 0 {
+			framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(fleetSize))
+		} else {
+			framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(0))
+		}
+	}
+
+	var wg sync.WaitGroup
+
+	for fleetNumber, flt := range fleets {
+		wg.Add(1)
+		go func(fleetNumber int, flt *v1alpha1.Fleet) {
+			defer wg.Done()
+			defer func() {
+				if err := recover(); err != nil {
+					t.Errorf("recovered panic: %v", err)
+				}
+			}()
+
+			if fleetNumber%2 == 0 {
+				scaleDownResults.AddSample(scaleAndWait(t, flt, 0))
+			}
+			for i := 0; i < repeatCount; i++ {
+				if time.Now().After(deadline) {
+					break
+				}
+				scaleUpResults.AddSample(scaleAndWait(t, flt, fleetSize))
+				scaleDownResults.AddSample(scaleAndWait(t, flt, 0))
+			}
+		}(fleetNumber, flt)
+	}
+
+	wg.Wait()
+
+	scaleUpResults.Report(fmt.Sprintf("scale up 0 to %v with %v fleets", fleetSize, fleetCount))
+	scaleDownResults.Report(fmt.Sprintf("scale down %v to 0 with %v fleets", fleetSize, fleetCount))
+}
+
+func scaleAndWait(t *testing.T, flt *v1alpha1.Fleet, fleetSize int32) time.Duration {
+	t0 := time.Now()
+	scaleFleetSubresource(t, flt, fleetSize)
+	framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(fleetSize))
+	return time.Since(t0)
+}
+
 // scaleFleetPatch creates a patch to apply to a Fleet.
 // Easier for testing, as it removes object generational issues.
 func scaleFleetPatch(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fleet {
@@ -654,19 +752,26 @@ func scaleFleetPatch(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fle
 
 // scaleFleetSubresource uses scale subresource to change Replicas size of the Fleet.
// Returns the same f as in parameter, just to keep signature in sync with scaleFleetPatch func scaleFleetSubresource(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fleet { - alpha1 := framework.AgonesClient.StableV1alpha1() - // GetScale returns current Scale object with resourceVersion which is opaque object - // and it will be used to create new Scale object - opts := metav1.GetOptions{} - sc, err := alpha1.Fleets(defaultNs).GetScale(f.ObjectMeta.Name, opts) - assert.Nil(t, err, "could not get the current scale subresource") + logrus.WithField("fleet", f.ObjectMeta.Name).WithField("scale", scale).Info("Scaling fleet") - sc2 := newScale(f.Name, scale, sc.ObjectMeta.ResourceVersion) - _, err = alpha1.Fleets(defaultNs).UpdateScale(f.ObjectMeta.Name, sc2) - assert.Nil(t, err, "could not update the scale subresource") + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + alpha1 := framework.AgonesClient.StableV1alpha1() + // GetScale returns current Scale object with resourceVersion which is opaque object + // and it will be used to create new Scale object + opts := metav1.GetOptions{} + sc, err := alpha1.Fleets(defaultNs).GetScale(f.ObjectMeta.Name, opts) + if err != nil { + return err + } - logrus.WithField("fleet", f.ObjectMeta.Name).WithField("scale", scale).Info("Scaling fleet") + sc2 := newScale(f.Name, scale, sc.ObjectMeta.ResourceVersion) + _, err = alpha1.Fleets(defaultNs).UpdateScale(f.ObjectMeta.Name, sc2) + return err + }) + if err != nil { + t.Fatal("could not update the scale subresource") + } return f } diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go index ce1adff9fc..2719a1d8c9 100644 --- a/test/e2e/framework/framework.go +++ b/test/e2e/framework/framework.go @@ -48,10 +48,11 @@ type Framework struct { AgonesClient versioned.Interface GameServerImage string PullSecret string + StressTestLevel int } // New setups a testing framework using a kubeconfig path and the game server image to use for testing. -func New(kubeconfig, gsimage string, pullSecret string) (*Framework, error) { +func New(kubeconfig, gsimage string, pullSecret string, stressTestLevel int) (*Framework, error) { config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { return nil, errors.Wrap(err, "build config from flags failed") @@ -72,6 +73,7 @@ func New(kubeconfig, gsimage string, pullSecret string) (*Framework, error) { AgonesClient: agonesClient, GameServerImage: gsimage, PullSecret: pullSecret, + StressTestLevel: stressTestLevel, }, nil } @@ -137,7 +139,7 @@ func (f *Framework) WaitForFleetCondition(t *testing.T, flt *v1alpha1.Fleet, con }) if err != nil { logrus.WithField("fleet", flt.Name).WithError(err).Info("error waiting for fleet condition") - t.Fatal("error waiting for fleet condition") + t.Fatalf("error waiting for fleet condition on fleet %v", flt.Name) } } diff --git a/test/e2e/framework/perf.go b/test/e2e/framework/perf.go new file mode 100644 index 0000000000..0786a741ef --- /dev/null +++ b/test/e2e/framework/perf.go @@ -0,0 +1,65 @@ +package framework + +import ( + "sort" + "sync" + "time" + + "github.com/sirupsen/logrus" +) + +// PerfResults aggregates performance test results. +// The AddSample() method is safe for concurrent use by multiple goroutines. +type PerfResults struct { + mu sync.Mutex + samples []time.Duration + + firstSampleTime time.Time + lastSampleTime time.Time +} + +// AddSample adds a single time measurement. 
+func (p *PerfResults) AddSample(d time.Duration) { + p.mu.Lock() + defer p.mu.Unlock() + + n := time.Now() + if len(p.samples) == 0 { + p.firstSampleTime = n + } + p.lastSampleTime = n + p.samples = append(p.samples, d) +} + +// Report outputs performance report to log. +func (p *PerfResults) Report(name string) { + if len(p.samples) == 0 { + return + } + + sort.Slice(p.samples, func(i, j int) bool { + return p.samples[i] < p.samples[j] + }) + + var sum time.Duration + for _, s := range p.samples { + sum += s + } + + avg := time.Duration(int64(sum) / int64(len(p.samples))) + logrus. + WithField("avg", avg). + WithField("count", len(p.samples)). + WithField("min", p.samples[0].Seconds()). + WithField("max", p.samples[len(p.samples)-1].Seconds()). + WithField("p50", p.samples[len(p.samples)*500/1001].Seconds()). + WithField("p90", p.samples[len(p.samples)*900/1001].Seconds()). + WithField("p95", p.samples[len(p.samples)*950/1001].Seconds()). + WithField("p99", p.samples[len(p.samples)*990/1001].Seconds()). + WithField("p999", p.samples[len(p.samples)*999/1001].Seconds()). + WithField("duration", p.lastSampleTime.Sub(p.firstSampleTime).Seconds()). + Info(name) + + // TODO - use something like Fortio ("fortio.org/fortio/stats") to + // generate histogram for long-term storage and analysis. +} diff --git a/test/e2e/main_test.go b/test/e2e/main_test.go index 1e13315104..369cf5fced 100644 --- a/test/e2e/main_test.go +++ b/test/e2e/main_test.go @@ -37,6 +37,7 @@ func TestMain(m *testing.M) { "gameserver image to use for those tests, gcr.io/agones-images/udp-server:0.6") pullSecret := flag.String("pullsecret", "", "optional secret to be used for pulling the gameserver and/or Agones SDK sidecar images") + stressTestLevel := flag.Int("stress", 0, "enable stress test at given level 0-100") flag.Parse() @@ -45,7 +46,7 @@ func TestMain(m *testing.M) { exitCode int ) - if framework, err = e2eframework.New(*kubeconfig, *gsimage, *pullSecret); err != nil { + if framework, err = e2eframework.New(*kubeconfig, *gsimage, *pullSecret, *stressTestLevel); err != nil { log.Printf("failed to setup framework: %v\n", err) os.Exit(1) }
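
As a usage sketch (not part of the patch itself), the new PerfResults helper can be driven from any e2e test roughly as below. It assumes the framework package import path and the `e2e` alias that fleet_test.go uses; the test name and the timed operation are placeholders.

package example

import (
	"fmt"
	"testing"
	"time"

	e2e "agones.dev/agones/test/e2e/framework"
)

func TestTimedOperationExample(t *testing.T) {
	// The zero value is ready to use; AddSample is safe to call from multiple goroutines.
	var results e2e.PerfResults

	const iterations = 5
	for i := 0; i < iterations; i++ {
		t0 := time.Now()
		time.Sleep(10 * time.Millisecond) // placeholder for the operation being measured
		results.AddSample(time.Since(t0))
	}

	// Logs count, avg, min/max and rough percentiles via logrus.
	results.Report(fmt.Sprintf("example operation, %v iterations", iterations))
}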