Added very simple stress test which scales fleets up/down and basic stress test harness #571

Merged 1 commit on Feb 11, 2019
18 changes: 18 additions & 0 deletions build/Makefile
@@ -48,6 +48,10 @@ GCP_BUCKET_CHARTS ?= agones-chart
MINIKUBE_PROFILE ?= agones
GO_BUILD_TAGS ?= none

# Specify the stress test level, 1..100
# STRESS_TEST_LEVEL=n requires cluster capacity for between 50*n and 100*n simple-udp GameServers.
STRESS_TEST_LEVEL=20

# kind cluster name to use
KIND_PROFILE ?= agones
KIND_CONTAINER_NAME=kind-$(KIND_PROFILE)-control-plane
@@ -217,6 +221,16 @@ test-e2e: $(ensure-build-image)
--gameserver-image=$(GS_TEST_IMAGE) \
--pullsecret=$(IMAGE_PULL_SECRET)

# Runs end-to-end stress tests on the currently configured cluster
# For minikube, use the minikube-stress-test-e2e target instead
stress-test-e2e: $(ensure-build-image)
$(GO_TEST) $(agones_package)/test/e2e $(ARGS) $(GO_E2E_TEST_ARGS) \
-timeout 1h \
-run '.*StressTest.*' \
--gameserver-image=$(GS_TEST_IMAGE) \
--pullsecret=$(IMAGE_PULL_SECRET) \
--stress $(STRESS_TEST_LEVEL)

# Run test on install yaml - make sure there is no change
# mostly this is for CI
test-install-yaml:
@@ -610,6 +624,10 @@ minikube-transfer-image:
minikube-test-e2e: DOCKER_RUN_ARGS=--network=host -v $(minikube_cert_mount)
minikube-test-e2e: minikube-agones-profile test-e2e

# Runs stress tests against our minikube
minikube-stress-test-e2e: DOCKER_RUN_ARGS=--network=host -v $(minikube_cert_mount)
minikube-stress-test-e2e: minikube-agones-profile stress-test-e2e

# prometheus on minikube
# we have to disable PVC as it's not supported on minikube.
minikube-setup-prometheus:
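Taken together, these targets make a stress run a one-liner: make stress-test-e2e against the currently configured cluster, or make minikube-stress-test-e2e against minikube, with the level overridable on the command line (for example make stress-test-e2e STRESS_TEST_LEVEL=50, since make command-line assignments take precedence over the STRESS_TEST_LEVEL= default above). As a worked example of the capacity note: at the default STRESS_TEST_LEVEL=20 the test below creates 10 fleets of 10*20 = 200 replicas each, so the cluster needs room for between 1,000 (half the fleets at full size) and 2,000 simple-udp GameServers at peak.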
125 changes: 115 additions & 10 deletions test/e2e/fleet_test.go
@@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
)

const (
@@ -640,6 +641,103 @@ func TestCreateFleetAndUpdateScaleSubresource(t *testing.T) {
framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(initialReplicas))
}

// TestScaleUpAndDownInParallelStressTest creates N fleets, half of which start at fleetSize replicas
// and the other half at zero, then scales them up and down 3 times in parallel, expecting each fleet
// to reach the desired number of ready replicas every time.
// The same test doubles as a stress test via 'make stress-test-e2e', in which case it creates many
// more, much larger fleets and runs many more repetitions.
func TestScaleUpAndDownInParallelStressTest(t *testing.T) {
t.Parallel()

alpha1 := framework.AgonesClient.StableV1alpha1()
fleetCount := 2
fleetSize := int32(10)
repeatCount := 3
deadline := time.Now().Add(1 * time.Minute)

logrus.WithField("fleetCount", fleetCount).
WithField("fleetSize", fleetSize).
WithField("repeatCount", repeatCount).
WithField("deadline", deadline).
Info("starting scale up/down test")

if framework.StressTestLevel > 0 {
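// At stress level n this means 10 fleets of 10*n replicas each, scaled up and down up to 10 times
// within a 45 minute deadline, i.e. between 50*n (half the fleets at full size) and 100*n (all
// fleets at full size) simple-udp GameServers at peak, matching the STRESS_TEST_LEVEL note in build/Makefile.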
fleetSize = 10 * int32(framework.StressTestLevel)
repeatCount = 10
fleetCount = 10
deadline = time.Now().Add(45 * time.Minute)
}

var fleets []*v1alpha1.Fleet

var scaleUpResults e2e.PerfResults
var scaleDownResults e2e.PerfResults

for fleetNumber := 0; fleetNumber < fleetCount; fleetNumber++ {
flt := defaultFleet()
flt.ObjectMeta.GenerateName = fmt.Sprintf("scale-fleet-%v-", fleetNumber)
if fleetNumber%2 == 0 {
// even-numbered fleets start at fleetSize and are scaled down to zero and back.
flt.Spec.Replicas = fleetSize
} else {
// odd-numbered fleets start at zero and are scaled up to fleetSize and back.
flt.Spec.Replicas = 0
}

flt, err := alpha1.Fleets(defaultNs).Create(flt)
if assert.Nil(t, err) {
defer alpha1.Fleets(defaultNs).Delete(flt.ObjectMeta.Name, nil) // nolint:errcheck
}
fleets = append(fleets, flt)
}

// wait for initial fleet conditions.
for fleetNumber, flt := range fleets {
if fleetNumber%2 == 0 {
framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(fleetSize))
} else {
framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(0))
}
}

var wg sync.WaitGroup

for fleetNumber, flt := range fleets {
wg.Add(1)
go func(fleetNumber int, flt *v1alpha1.Fleet) {
defer wg.Done()
defer func() {
if err := recover(); err != nil {
t.Errorf("recovered panic: %v", err)
}
}()

if fleetNumber%2 == 0 {
scaleDownResults.AddSample(scaleAndWait(t, flt, 0))
}
for i := 0; i < repeatCount; i++ {
if time.Now().After(deadline) {
break
}
scaleUpResults.AddSample(scaleAndWait(t, flt, fleetSize))
scaleDownResults.AddSample(scaleAndWait(t, flt, 0))
}
}(fleetNumber, flt)
}

wg.Wait()

scaleUpResults.Report(fmt.Sprintf("scale up 0 to %v with %v fleets", fleetSize, fleetCount))
scaleDownResults.Report(fmt.Sprintf("scale down %v to 0 with %v fleets", fleetSize, fleetCount))
}

func scaleAndWait(t *testing.T, flt *v1alpha1.Fleet, fleetSize int32) time.Duration {
t0 := time.Now()
scaleFleetSubresource(t, flt, fleetSize)
framework.WaitForFleetCondition(t, flt, e2e.FleetReadyCount(fleetSize))
return time.Since(t0)
}

// scaleFleetPatch creates a patch to apply to a Fleet.
// Easier for testing, as it removes object generational issues.
func scaleFleetPatch(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fleet {
@@ -654,19 +752,26 @@ func scaleFleetPatch(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fleet {
// scaleFleetSubresource uses the scale subresource to change the Replicas count of the Fleet.
// It returns the same f it was passed, just to keep the signature in sync with scaleFleetPatch.
func scaleFleetSubresource(t *testing.T, f *v1alpha1.Fleet, scale int32) *v1alpha1.Fleet {
alpha1 := framework.AgonesClient.StableV1alpha1()
// GetScale returns current Scale object with resourceVersion which is opaque object
// and it will be used to create new Scale object
opts := metav1.GetOptions{}
sc, err := alpha1.Fleets(defaultNs).GetScale(f.ObjectMeta.Name, opts)
assert.Nil(t, err, "could not get the current scale subresource")
logrus.WithField("fleet", f.ObjectMeta.Name).WithField("scale", scale).Info("Scaling fleet")

sc2 := newScale(f.Name, scale, sc.ObjectMeta.ResourceVersion)
_, err = alpha1.Fleets(defaultNs).UpdateScale(f.ObjectMeta.Name, sc2)
assert.Nil(t, err, "could not update the scale subresource")
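// The scale subresource update is guarded by the Fleet's resourceVersion, so when many fleets
// are scaled concurrently the API server can reject the update with a Conflict error;
// RetryOnConflict re-runs the Get+Update below with the default backoff until it succeeds or
// returns a non-conflict error.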
err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
alpha1 := framework.AgonesClient.StableV1alpha1()
// GetScale returns the current Scale object; its resourceVersion (an opaque value) is used
// below when creating the new Scale object.
opts := metav1.GetOptions{}
sc, err := alpha1.Fleets(defaultNs).GetScale(f.ObjectMeta.Name, opts)
if err != nil {
return err
}

logrus.WithField("fleet", f.ObjectMeta.Name).WithField("scale", scale).Info("Scaling fleet")
sc2 := newScale(f.Name, scale, sc.ObjectMeta.ResourceVersion)
_, err = alpha1.Fleets(defaultNs).UpdateScale(f.ObjectMeta.Name, sc2)
return err
})

if err != nil {
t.Fatal("could not update the scale subresource")
}
return f
}

6 changes: 4 additions & 2 deletions test/e2e/framework/framework.go
@@ -48,10 +48,11 @@ type Framework struct {
AgonesClient versioned.Interface
GameServerImage string
PullSecret string
StressTestLevel int
}

// New sets up a testing framework using a kubeconfig path and the game server image to use for testing.
func New(kubeconfig, gsimage string, pullSecret string) (*Framework, error) {
func New(kubeconfig, gsimage string, pullSecret string, stressTestLevel int) (*Framework, error) {
config, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
if err != nil {
return nil, errors.Wrap(err, "build config from flags failed")
@@ -72,6 +73,7 @@ func New(kubeconfig, gsimage string, pullSecret string) (*Framework, error) {
AgonesClient: agonesClient,
GameServerImage: gsimage,
PullSecret: pullSecret,
StressTestLevel: stressTestLevel,
}, nil
}

Expand Down Expand Up @@ -137,7 +139,7 @@ func (f *Framework) WaitForFleetCondition(t *testing.T, flt *v1alpha1.Fleet, con
})
if err != nil {
logrus.WithField("fleet", flt.Name).WithError(err).Info("error waiting for fleet condition")
t.Fatal("error waiting for fleet condition")
t.Fatalf("error waiting for fleet condition on fleet %v", flt.Name)
}
}

65 changes: 65 additions & 0 deletions test/e2e/framework/perf.go
@@ -0,0 +1,65 @@
package framework

import (
"sort"
"sync"
"time"

"github.com/sirupsen/logrus"
)

// PerfResults aggregates performance test results.
// The AddSample() method is safe for concurrent use by multiple goroutines.
type PerfResults struct {
mu sync.Mutex
samples []time.Duration

firstSampleTime time.Time
lastSampleTime time.Time
}

// AddSample adds a single time measurement.
func (p *PerfResults) AddSample(d time.Duration) {
p.mu.Lock()
defer p.mu.Unlock()

n := time.Now()
if len(p.samples) == 0 {
p.firstSampleTime = n
}
p.lastSampleTime = n
p.samples = append(p.samples, d)
}

// Report outputs performance report to log.
func (p *PerfResults) Report(name string) {
if len(p.samples) == 0 {
return
}

sort.Slice(p.samples, func(i, j int) bool {
return p.samples[i] < p.samples[j]
})

var sum time.Duration
for _, s := range p.samples {
sum += s
}

avg := time.Duration(int64(sum) / int64(len(p.samples)))
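// Percentile indexes are computed as len*k/1001 (k = 500..999), an approximation of the
// k/10-th percentile that always stays within the bounds of the sorted slice.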
logrus.
WithField("avg", avg).
WithField("count", len(p.samples)).
WithField("min", p.samples[0].Seconds()).
WithField("max", p.samples[len(p.samples)-1].Seconds()).
WithField("p50", p.samples[len(p.samples)*500/1001].Seconds()).
WithField("p90", p.samples[len(p.samples)*900/1001].Seconds()).
WithField("p95", p.samples[len(p.samples)*950/1001].Seconds()).
WithField("p99", p.samples[len(p.samples)*990/1001].Seconds()).
WithField("p999", p.samples[len(p.samples)*999/1001].Seconds()).
WithField("duration", p.lastSampleTime.Sub(p.firstSampleTime).Seconds()).
Info(name)

// TODO - use something like Fortio ("fortio.org/fortio/stats") to
// generate histogram for long-term storage and analysis.
}
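For reference, here is a minimal, hypothetical sketch of how a test or tool could use this helper outside of TestScaleUpAndDownInParallelStressTest; the import path assumes the agones.dev/agones module layout, and doSomeWork is a made-up stand-in for the operation being timed.

package main

import (
	"time"

	e2e "agones.dev/agones/test/e2e/framework"
)

// doSomeWork is a made-up stand-in for the operation being measured,
// e.g. scaling a fleet and waiting for it to become Ready.
func doSomeWork() {
	time.Sleep(10 * time.Millisecond)
}

func main() {
	// The zero value is ready to use; AddSample is safe to call from multiple goroutines.
	var results e2e.PerfResults

	for i := 0; i < 5; i++ {
		start := time.Now()
		doSomeWork()
		results.AddSample(time.Since(start))
	}

	// Logs count, min, max, avg, approximate percentiles and overall duration via logrus.
	results.Report("doSomeWork sample run")
}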
3 changes: 2 additions & 1 deletion test/e2e/main_test.go
@@ -37,6 +37,7 @@ func TestMain(m *testing.M) {
"gameserver image to use for those tests, gcr.io/agones-images/udp-server:0.6")
pullSecret := flag.String("pullsecret", "",
"optional secret to be used for pulling the gameserver and/or Agones SDK sidecar images")
stressTestLevel := flag.Int("stress", 0, "enable stress test at given level 0-100")

flag.Parse()

@@ -45,7 +46,7 @@
exitCode int
)

if framework, err = e2eframework.New(*kubeconfig, *gsimage, *pullSecret); err != nil {
if framework, err = e2eframework.New(*kubeconfig, *gsimage, *pullSecret, *stressTestLevel); err != nil {
log.Printf("failed to setup framework: %v\n", err)
os.Exit(1)
}
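End to end, the stress level flows from the Makefile's STRESS_TEST_LEVEL (passed to go test as --stress $(STRESS_TEST_LEVEL)) into this flag, is stored on the Framework as StressTestLevel, and is read by TestScaleUpAndDownInParallelStressTest to pick the fleet count, fleet size, repetition count and deadline; at the default level of 0 the test still runs, just as a small, fast e2e test.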