Skip to content

Commit

Permalink
Automate Zero Downtime Scale tests
Browse files Browse the repository at this point in the history
Problem: We want our NFR tests to be fully automated to save developer time in each release cycle, and to have a repeatable way of running the tests.

Solution: Automate the zero downtime scaling test. No longer collecting logs as done previously, because error logs would be unreliable to collect for pods that are scaling down (we don't have persistence that we can easily use in the automation to gather historic logs). Ultimately we are still ensuring that traffic is flowing and status updates occur, which are the important pieces here.
  • Loading branch information
sjberman committed Jul 18, 2024
1 parent 07b9d60 commit d75e499
Show file tree
Hide file tree
Showing 138 changed files with 483 additions and 88,553 deletions.
4 changes: 2 additions & 2 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ run-tests-on-vm: ## Run the functional tests on a GCP VM

.PHONY: nfr-test
nfr-test: ## Run the NFR tests on a GCP VM
NFR=true bash scripts/run-tests-gcp-vm.sh
NFR=true CI=$(CI) bash scripts/run-tests-gcp-vm.sh

.PHONY: start-longevity-test
start-longevity-test: export START_LONGEVITY=true
Expand All @@ -110,7 +110,7 @@ stop-longevity-test: nfr-test ## Stop the longevity test and collects results
.PHONY: .vm-nfr-test
.vm-nfr-test: ## Runs the NFR tests on the GCP VM (called by `nfr-test`)
go run github.com/onsi/ginkgo/v2/ginkgo --randomize-all --randomize-suites --keep-going --fail-on-pending --trace -r -v \
--label-filter "nfr" $(GINKGO_FLAGS) ./suite -- --gateway-api-version=$(GW_API_VERSION) \
--label-filter "nfr" $(GINKGO_FLAGS) --timeout 3h ./suite -- --gateway-api-version=$(GW_API_VERSION) \
--gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \
--plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \
--pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \
Expand Down
29 changes: 29 additions & 0 deletions tests/framework/resourcemanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -744,3 +744,32 @@ func (rm *ResourceManager) WaitForPodsToBeReadyWithCount(ctx context.Context, na
},
)
}

// WaitForGatewayObservedGeneration polls the named Gateway until at least one of its status
// conditions reports an ObservedGeneration equal to generation.
//
// The Gateway is fetched every 500ms, with the first attempt made immediately. An error from
// fetching the Gateway is returned by the condition function rather than retried; otherwise
// polling continues until a match is found or ctx ends.
func (rm *ResourceManager) WaitForGatewayObservedGeneration(
	ctx context.Context,
	namespace,
	name string,
	generation int,
) error {
	const pollInterval = 500 * time.Millisecond

	gatewayKey := types.NamespacedName{Namespace: namespace, Name: name}
	// Condition generations are int64, so convert once up front instead of on every comparison.
	wantGeneration := int64(generation)

	generationObserved := func(pollCtx context.Context) (bool, error) {
		var gateway v1.Gateway
		err := rm.K8sClient.Get(pollCtx, gatewayKey, &gateway)
		if err != nil {
			return false, err
		}

		conditions := gateway.Status.Conditions
		for i := range conditions {
			if conditions[i].ObservedGeneration == wantGeneration {
				return true, nil
			}
		}

		return false, nil
	}

	return wait.PollUntilContextCancel(ctx, pollInterval, true /* poll immediately */, generationObserved)
}
8 changes: 7 additions & 1 deletion tests/scripts/run-tests-gcp-vm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@
bash -s" < ${SCRIPT_DIR}/remote-scripts/${SCRIPT}

if [ "${NFR}" = "true" ]; then
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
## Use rsync if running locally (faster); otherwise if in the pipeline don't download an SSH config
if [ "${CI}" = "false" ]; then
gcloud compute config-ssh --ssh-config-file ngf-gcp.ssh > /dev/null
rsync -ave 'ssh -F ngf-gcp.ssh' username@${RESOURCE_NAME}.${GKE_CLUSTER_ZONE}.${GKE_PROJECT}:~/nginx-gateway-fabric/tests/results .
else
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
fi
fi

## If tearing down the longevity test, we need to collect logs from gcloud and add to the results
Expand Down
4 changes: 0 additions & 4 deletions tests/suite/manifests/ngf-upgrade/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,3 @@ affinity:
labelSelector:
matchLabels:
app.kubernetes.io/name: nginx-gateway

service:
annotations:
networking.gke.io/load-balancer-type: "Internal"
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
nginxGateway:
image:
repository: ghcr.io/nginxinc/nginx-gateway-fabric
tag: edge # change this tag if you are testing a different version
pullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- /usr/bin/gateway
- sleep
- --duration=40s
config:
logging:
level: debug

nginx:
lifecycle:
Expand All @@ -31,7 +24,3 @@ affinity:
labelSelector:
matchLabels:
app.kubernetes.io/name: nginx-gateway

service:
annotations:
networking.gke.io/load-balancer-type: "Internal"
18 changes: 18 additions & 0 deletions tests/suite/manifests/scale/zero-downtime/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Values used by the zero-downtime scale test (tests/suite/manifests/scale/zero-downtime).
# Both containers get a 40s preStop sleep hook.
nginxGateway:
lifecycle:
preStop:
exec:
# Runs the gateway binary's own sleep subcommand for 40s.
command:
- /usr/bin/gateway
- sleep
- --duration=40s

nginx:
lifecycle:
preStop:
exec:
# Runs /bin/sleep for 40 seconds, matching the nginxGateway hook duration above.
command:
- /bin/sleep
- "40"

# 10s longer than the 40s preStop sleeps configured above.
terminationGracePeriodSeconds: 50
Loading

0 comments on commit d75e499

Please sign in to comment.