From 0cf65652b2537486e03faeff5b7c9ccc1520be18 Mon Sep 17 00:00:00 2001
From: WVerlaek <wouter@improbable.io>
Date: Wed, 5 Jan 2022 15:39:34 +0100
Subject: [PATCH] e2e test to reproduce issue#2397

---
 test/e2e/fleet_test.go          | 46 ++++++++++++++++++++++++++++-----
 test/e2e/framework/framework.go | 32 +++++++++++++++++++++++
 2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/test/e2e/fleet_test.go b/test/e2e/fleet_test.go
index 9ce1f79521..c7ff98a4eb 100644
--- a/test/e2e/fleet_test.go
+++ b/test/e2e/fleet_test.go
@@ -223,14 +223,16 @@ func TestFleetRollingUpdate(t *testing.T) {
 	t.Parallel()
 	ctx := context.Background()
 	// Use scaleFleetPatch (true) or scaleFleetSubresource (false)
-	fixtures := []bool{true, false}
-	maxSurge := []string{"25%", "10%"}
+	fixtures := []bool{true}    //, false} // TODO Enable these again
+	maxSurge := []string{"25%"} //, "10%"} // TODO
+	doCycle := true             // TODO: fixture?
 
 	for _, usePatch := range fixtures {
 		for _, maxSurgeParam := range maxSurge {
 			usePatch := usePatch
 			maxSurgeParam := maxSurgeParam
-			t.Run(fmt.Sprintf("Use fleet Patch %t %s", usePatch, maxSurgeParam), func(t *testing.T) {
+			doCycleParam := doCycle
+			t.Run(fmt.Sprintf("Use fleet Patch %t %s cycle %t", usePatch, maxSurgeParam, doCycleParam), func(t *testing.T) {
 				t.Parallel()
 
 				client := framework.AgonesClient.AgonesV1()
@@ -267,10 +269,33 @@ func TestFleetRollingUpdate(t *testing.T) {
 				flt, err = client.Fleets(framework.Namespace).Get(ctx, flt.ObjectMeta.GetName(), metav1.GetOptions{})
 				assert.NoError(t, err)
 
+				done := make(chan bool, 1)
+				defer close(done)
+				if doCycleParam {
+					// Repeatedly cycle allocations to keep ~half of the GameServers Allocated, spread over both GSSets.
+					// Simulates a rolling update on a live Fleet that continuously receives new allocations,
+					// and reproduces an issue where this causes a rolling update to get stuck.
+					const halfScale = targetScale / 2
+					go framework.CycleAllocations(t, flt, time.Second*3, time.Second*halfScale*3, done)
+
+					// Wait for at least half of the fleet to have been cycled (either Allocated or shutting down)
+					// before updating the fleet.
+					err = framework.WaitForFleetCondition(t, flt, func(entry *logrus.Entry, fleet *agonesv1.Fleet) bool {
+						return fleet.Status.ReadyReplicas < halfScale
+					})
+				}
+
 				// Change ContainerPort to trigger creating a new GSSet
-				fltCopy := flt.DeepCopy()
-				fltCopy.Spec.Template.Spec.Ports[0].ContainerPort++
-				flt, err = client.Fleets(framework.Namespace).Update(ctx, fltCopy, metav1.UpdateOptions{})
+				err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+					flt, err = client.Fleets(framework.Namespace).Get(ctx, flt.GetName(), metav1.GetOptions{})
+					if err != nil {
+						return err
+					}
+					fltCopy := flt.DeepCopy()
+					fltCopy.Spec.Template.Spec.Ports[0].ContainerPort++
+					flt, err = client.Fleets(framework.Namespace).Update(ctx, fltCopy, metav1.UpdateOptions{})
+					return err
+				})
 				assert.NoError(t, err)
 
 				selector := labels.SelectorFromSet(labels.Set{agonesv1.FleetNameLabel: flt.ObjectMeta.Name})
@@ -308,7 +333,9 @@ func TestFleetRollingUpdate(t *testing.T) {
 					assert.Nil(t, err)
 
 					expectedTotal := targetScale + maxSurge + maxUnavailable + shift
-					if len(list.Items) > expectedTotal {
+					if len(list.Items) > expectedTotal && !doCycleParam {
+						// This fails when Allocation cycling is enabled as there's a number of additional gameservers
+						// shutting down.
 						err = fmt.Errorf("new replicas should be less than target + maxSurge + maxUnavailable + shift. Replicas: %d, Expected: %d", len(list.Items), expectedTotal)
 					}
 					if err != nil {
@@ -324,6 +351,11 @@ func TestFleetRollingUpdate(t *testing.T) {
 
 				assert.NoError(t, err)
 
+				// Stop cycling Allocations.
+				// The AssertFleetCondition below will wait until the Allocation cycling has
+				// fully stopped (when all Allocated GameServers are shut down).
+				done <- true
+
 				// scale down, with allocation
 				const scaleDownTarget = 1
 				if usePatch {
diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go
index bbfdd556c3..7bc09b6a16 100644
--- a/test/e2e/framework/framework.go
+++ b/test/e2e/framework/framework.go
@@ -278,6 +278,40 @@ func (f *Framework) WaitForGameServerState(t *testing.T, gs *agonesv1.GameServer
 		state, gs.Namespace, gs.Name)
 }
 
+// GetGameServer fetches the GameServer with the given name from the given namespace.
+// The test is failed immediately (via require) if the GameServer cannot be retrieved.
+func (f *Framework) GetGameServer(t *testing.T, namespace string, name string) *agonesv1.GameServer {
+	gs, err := f.AgonesClient.AgonesV1().GameServers(namespace).Get(context.Background(), name, metav1.GetOptions{})
+	require.NoError(t, err, "failed to get gameserver: %s/%s", namespace, name)
+	return gs
+}
+
+// CycleAllocations allocates one GameServer from the Fleet on every tick of period (ticks on which no GameServer could
+// be Allocated are skipped), and deletes each Allocated GameServer allocDuration after its allocation succeeded.
+// It blocks until done receives a value or is closed; deletions already scheduled still run after it has returned.
+func (f *Framework) CycleAllocations(t *testing.T, flt *agonesv1.Fleet, period time.Duration, allocDuration time.Duration, done <-chan bool) {
+	ticker := time.NewTicker(period)
+	defer ticker.Stop() // release the ticker's resources once cycling ends
+	for {
+		select {
+		case <-done:
+			return
+		case <-ticker.C:
+			gsa, err := f.AgonesClient.AllocationV1().GameServerAllocations(flt.Namespace).Create(context.Background(), GetAllocation(flt), metav1.CreateOptions{})
+			if err != nil || gsa.Status.State != allocationv1.GameServerAllocationAllocated {
+				continue // nothing was Allocated on this tick; try again on the next one
+			}
+			// Delete after allocDuration. Runs outside the test goroutine, so report failures with t.Errorf, not require (FailNow).
+			go func(gsa *allocationv1.GameServerAllocation) {
+				time.Sleep(allocDuration)
+				if err := f.AgonesClient.AgonesV1().GameServers(gsa.Namespace).Delete(context.Background(), gsa.Status.GameServerName, metav1.DeleteOptions{}); err != nil {
+					t.Errorf("failed to delete allocated gameserver %s/%s: %v", gsa.Namespace, gsa.Status.GameServerName, err)
+				}
+			}(gsa)
+		}
+	}
+}
+
 // AssertFleetCondition waits for the Fleet to be in a specific condition or fails the test if the condition can't be met in 5 minutes.
 func (f *Framework) AssertFleetCondition(t *testing.T, flt *agonesv1.Fleet, condition func(*logrus.Entry, *agonesv1.Fleet) bool) {
 	err := f.WaitForFleetCondition(t, flt, condition)