From 28c753da63995c270bd9d9388125c58be8c5d114 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 12 Jun 2024 16:23:07 -0700 Subject: [PATCH 01/32] Automate remaining graceful recovery tests --- tests/suite/graceful_recovery_test.go | 202 ++++++++++++++++++++++---- 1 file changed, 172 insertions(+), 30 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 033cd3be3f..fd9e29eec6 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "net/http" + "os/exec" "strings" "time" @@ -15,6 +16,7 @@ import ( core "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + ctlr "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/yaml" @@ -98,8 +100,126 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu It("recovers when nginx container is restarted", func() { runRecoveryTest(teaURL, coffeeURL, ngfPodName, nginxContainerName, files, &ns) }) + + It("recovers when drained node is restarted", func() { + runRestartNodeTest(teaURL, coffeeURL, files, &ns, true) + }) + + It("recovers when node is restarted abruptly", func() { + // FIXME(bjee19) remove Skip() when https://github.com/nginxinc/nginx-gateway-fabric/issues/1108 is completed. + Skip("Test currently fails due to this issue: https://github.com/nginxinc/nginx-gateway-fabric/issues/1108") + runRestartNodeTest(teaURL, coffeeURL, files, &ns, false) + }) }) +func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Namespace, drain bool) { + nodeNames, err := getNodeNames() + Expect(err).ToNot(HaveOccurred()) + Expect(nodeNames).To(HaveLen(1)) + + kindNodeName := nodeNames[0] + + if portFwdPort != 0 { + close(portForwardStopCh) + } + + if drain { + _, err := exec.Command( + "kubectl", + "drain", + kindNodeName, + "--ignore-daemonsets", + "--delete-local-data", + ).CombinedOutput() + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + + _, err = exec.Command( + "kubectl", + "delete", + "node", + kindNodeName, + ).CombinedOutput() + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + } + + containerOutput, err := exec.Command( + "docker", + "container", + "ls", + ).CombinedOutput() + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + fmt.Println(string(containerOutput)) + + var containerName string + for _, line := range strings.Split(string(containerOutput), "\n") { + for _, word := range strings.Split(line, " ") { + // This is a potential weak spot in the code where we rely on the container which NGF + // is running on to contain "control-plane" in the name and for no other container to have that either. + // This is currently working in our test framework may break in the future. + if strings.Contains(word, "control-plane") { + containerName = strings.TrimSpace(word) + break + } + } + } + Expect(containerName).ToNot(Equal("")) + + // really jank - get the string that contains "control-plane" + fmt.Println("This is our container name: " + containerName) + + _, err = exec.Command( + "docker", + "restart", + containerName, + ).CombinedOutput() + if err != nil { + fmt.Println(fmt.Sprint(err.Error())) + Expect(err).ToNot(HaveOccurred()) + } + + // need to wait for docker container to restart and be running before polling for ready NGF Pods or else we will error + Eventually( + func() bool { + output, err := exec.Command( + "docker", + "container", + "inspect", + containerName, + ).CombinedOutput() + return strings.Contains(string(output), "\"Running\": true") && err == nil + }). + WithTimeout(timeoutConfig.CreateTimeout). + WithPolling(500 * time.Millisecond). + Should(BeTrue()) + + var podNames []string + Eventually( + func() bool { + podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout) + return len(podNames) == 1 && err == nil + }). + WithTimeout(timeoutConfig.CreateTimeout). + WithPolling(500 * time.Millisecond). + Should(BeTrue()) + ngfPodName := podNames[0] + Expect(ngfPodName).ToNot(Equal("")) + + if portFwdPort != 0 { + ports := []string{fmt.Sprintf("%d:80", ngfHTTPForwardedPort), fmt.Sprintf("%d:443", ngfHTTPSForwardedPort)} + portForwardStopCh = make(chan struct{}) + err = framework.PortForward(ctlr.GetConfigOrDie(), ngfNamespace, ngfPodName, ports, portForwardStopCh) + Expect(err).ToNot(HaveOccurred()) + } + + checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, files, ns) +} + func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) { var ( err error @@ -127,36 +247,7 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files Should(Succeed()) } - Eventually( - func() error { - return checkForWorkingTraffic(teaURL, coffeeURL) - }). - WithTimeout(timeoutConfig.RequestTimeout). - WithPolling(500 * time.Millisecond). - Should(Succeed()) - - Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - - Eventually( - func() error { - return checkForFailingTraffic(teaURL, coffeeURL) - }). - WithTimeout(timeoutConfig.RequestTimeout). - WithPolling(500 * time.Millisecond). - Should(Succeed()) - - Expect(resourceManager.ApplyFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.WaitForAppsToBeReadyWithPodCount(ns.Name, 2)).To(Succeed()) - - Eventually( - func() error { - return checkForWorkingTraffic(teaURL, coffeeURL) - }). - WithTimeout(timeoutConfig.RequestTimeout * 2). - WithPolling(500 * time.Millisecond). - Should(Succeed()) - - checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName) + checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, files, ns) } func restartContainer(ngfPodName, containerName string) { @@ -254,6 +345,39 @@ func expectRequestToFail(appURL, address string) error { return nil } +func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName string, files []string, ns *core.Namespace) { + Eventually( + func() error { + return checkForWorkingTraffic(teaURL, coffeeURL) + }). + WithTimeout(timeoutConfig.RequestTimeout). + WithPolling(500 * time.Millisecond). + Should(Succeed()) + + Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) + + Eventually( + func() error { + return checkForFailingTraffic(teaURL, coffeeURL) + }). + WithTimeout(timeoutConfig.RequestTimeout). + WithPolling(500 * time.Millisecond). + Should(Succeed()) + + Expect(resourceManager.ApplyFromFiles(files, ns.Name)).To(Succeed()) + Expect(resourceManager.WaitForAppsToBeReadyWithPodCount(ns.Name, 2)).To(Succeed()) + + Eventually( + func() error { + return checkForWorkingTraffic(teaURL, coffeeURL) + }). + WithTimeout(timeoutConfig.RequestTimeout). + WithPolling(500 * time.Millisecond). + Should(Succeed()) + + checkContainerLogsForErrors(ngfPodName, true) +} + // checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors. // Since this function retrieves all the logs from both containers and the NGF pod may be shared between tests, // the logs retrieved may contain log messages from previous tests, thus any errors in the logs from previous tests @@ -349,6 +473,24 @@ func getContainerRestartCount(ngfPodName, containerName string) (int, error) { return restartCount, nil } +func getNodeNames() ([]string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout) + defer cancel() + var nodes core.NodeList + + if err := k8sClient.List(ctx, &nodes); err != nil { + return nil, fmt.Errorf("error getting nodes: %w", err) + } + + names := make([]string, 0, len(nodes.Items)) + + for _, node := range nodes.Items { + names = append(names, node.Name) + } + + return names, nil +} + func runNodeDebuggerJob(ngfPodName, jobScript string) (*v1.Job, error) { ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout) defer cancel() From 6b039b837a1291fccc9e6b841e9e1e480b63ace7 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 17 Jun 2024 13:56:45 -0700 Subject: [PATCH 02/32] Remove debugging statements --- tests/suite/graceful_recovery_test.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index fd9e29eec6..5b891ea523 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -154,7 +154,6 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names if err != nil { Expect(err).ToNot(HaveOccurred()) } - fmt.Println(string(containerOutput)) var containerName string for _, line := range strings.Split(string(containerOutput), "\n") { @@ -170,9 +169,6 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) - // really jank - get the string that contains "control-plane" - fmt.Println("This is our container name: " + containerName) - _, err = exec.Command( "docker", "restart", From 0ea468209881f45568c6f8c7ce21b300bcbef964 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 17 Jun 2024 14:30:24 -0700 Subject: [PATCH 03/32] Run pipeline with failing test --- tests/suite/graceful_recovery_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 5b891ea523..b0bdbfe703 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -169,10 +169,20 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) + //_, err = exec.Command( + // "docker", + // "restart", + // containerName, + //).CombinedOutput() + //if err != nil { + // fmt.Println(fmt.Sprint(err.Error())) + // Expect(err).ToNot(HaveOccurred()) + //} + _, err = exec.Command( "docker", "restart", - containerName, + "", ).CombinedOutput() if err != nil { fmt.Println(fmt.Sprint(err.Error())) From 1971d9642a94ef4ec668f2bc6e678dd94755e768 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 17 Jun 2024 15:02:23 -0700 Subject: [PATCH 04/32] Revert the test to make it work --- tests/suite/graceful_recovery_test.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index b0bdbfe703..5b891ea523 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -169,20 +169,10 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) - //_, err = exec.Command( - // "docker", - // "restart", - // containerName, - //).CombinedOutput() - //if err != nil { - // fmt.Println(fmt.Sprint(err.Error())) - // Expect(err).ToNot(HaveOccurred()) - //} - _, err = exec.Command( "docker", "restart", - "", + containerName, ).CombinedOutput() if err != nil { fmt.Println(fmt.Sprint(err.Error())) From 4b31588b23485c05158ed73427cf2d423406266f Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 07:51:08 -0700 Subject: [PATCH 05/32] Run pipeline --- .github/workflows/functional.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index b55e114e6c..33b7e65ceb 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -119,6 +119,13 @@ jobs: make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=telemetry GW_SERVICE_TYPE=LoadBalancer working-directory: ./tests + - name: Run functional graceful-recovery tests + run: | + ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric + ngf_tag=${{ steps.ngf-meta.outputs.version }} + make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=graceful-recovery GW_SERVICE_TYPE=LoadBalancer + working-directory: ./tests + - name: Run functional tests run: | ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric From 7f52961ce01c420ac20ef1069e98536f5090ce9c Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 08:12:04 -0700 Subject: [PATCH 06/32] Run pipeline with failing test --- tests/suite/graceful_recovery_test.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 5b891ea523..c3b6c50803 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -169,10 +169,19 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) + //_, err = exec.Command( + // "docker", + // "restart", + // containerName, + //).CombinedOutput() + //if err != nil { + // fmt.Println(fmt.Sprint(err.Error())) + // Expect(err).ToNot(HaveOccurred()) + //} + _, err = exec.Command( "docker", "restart", - containerName, ).CombinedOutput() if err != nil { fmt.Println(fmt.Sprint(err.Error())) From ea8fa3da310d5ef0e36ae8fcf195fbd678b4cb43 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 08:57:00 -0700 Subject: [PATCH 07/32] Remove manual test document --- tests/graceful-recovery/graceful-recovery.md | 101 ------------------ .../graceful-recovery}/1.0.0/1.0.0.md | 0 .../graceful-recovery}/1.1.0/1.1.0.md | 0 .../graceful-recovery}/1.2.0/1.2.0.md | 0 .../graceful-recovery}/1.3.0/1.3.0.md | 0 5 files changed, 101 deletions(-) delete mode 100644 tests/graceful-recovery/graceful-recovery.md rename tests/{graceful-recovery/results => results/graceful-recovery}/1.0.0/1.0.0.md (100%) rename tests/{graceful-recovery/results => results/graceful-recovery}/1.1.0/1.1.0.md (100%) rename tests/{graceful-recovery/results => results/graceful-recovery}/1.2.0/1.2.0.md (100%) rename tests/{graceful-recovery/results => results/graceful-recovery}/1.3.0/1.3.0.md (100%) diff --git a/tests/graceful-recovery/graceful-recovery.md b/tests/graceful-recovery/graceful-recovery.md deleted file mode 100644 index 15be7e2dc8..0000000000 --- a/tests/graceful-recovery/graceful-recovery.md +++ /dev/null @@ -1,101 +0,0 @@ -# Graceful recovery from restarts - -This document describes how we test graceful recovery from restarts on NGF. - - -- [Graceful recovery from restarts](#graceful-recovery-from-restarts) - - [Goal](#goal) - - [Test Environment](#test-environment) - - [Steps](#steps) - - [Setup](#setup) - - [Run the tests](#run-the-tests) - - [Restart Node with draining](#restart-node-with-draining) - - [Restart Node without draining](#restart-node-without-draining) - - -## Goal - -Ensure that NGF can recover gracefully from container failures without any user intervention. - -## Test Environment - -- A Kind cluster - -## Steps - -### Setup - -1. Deploy a one-Node Kind cluster. Can run `make create-kind-cluster` from main directory. - -2. Go into `deploy/manifests/nginx-gateway.yaml` and change the following: - - - `runAsNonRoot` from `true` to `false`: this allows us to insert our ephemeral container as root which enables us to restart the nginx-gateway container. - - Add the `--product-telemetry-disable` argument to the nginx-gateway container args. - -3. Follow [this guide](https://docs.nginx.com/nginx-gateway-fabric/installation/running-on-kind/) to deploy NGINX Gateway Fabric using manifests and expose it through a NodePort Service. - -4. In a separate terminal track NGF logs. - - ```console - kubectl -n nginx-gateway logs -f deploy/nginx-gateway -c nginx-gateway - ``` - -5. In a separate terminal track NGINX container logs. - - ```console - kubectl -n nginx-gateway logs -f deploy/nginx-gateway -c nginx - ``` - -6. In a separate terminal Exec into the NGINX container inside the NGF pod. - - ```console - kubectl exec -it -n nginx-gateway $(kubectl get pods -n nginx-gateway | sed -n '2s/^\([^[:space:]]*\).*$/\1/p') --container nginx -- sh - ``` - -7. In a different terminal, deploy the -[https-termination example](https://github.com/nginxinc/nginx-gateway-fabric/tree/main/examples/https-termination). -8. Send traffic through the example application and ensure it is working correctly. - -### Run the tests - -#### Restart Node with draining - -1. Drain the Node of its resources. - - ```console - kubectl drain kind-control-plane --ignore-daemonsets --delete-local-data - ``` - -2. Delete the Node. - - ```console - kubectl delete node kind-control-plane - ``` - -3. Restart the Docker container. - - ```console - docker restart kind-control-plane - ``` - -4. Check the logs of the old and new NGF and NGINX containers for errors. -5. Send traffic through the example application and ensure it is working correctly. -6. Check that NGF can still process changes of resources. - 1. Delete the HTTPRoute resources. - - ```console - kubectl delete -f ../../examples/https-termination/cafe-routes.yaml - ``` - - 2. Send traffic through the example application using the updated resources and ensure traffic does not flow. - 3. Apply the HTTPRoute resources. - - ```console - kubectl apply -f ../../examples/https-termination/cafe-routes.yaml - ``` - - 4. Send traffic through the example application using the updated resources and ensure traffic flows correctly. - -#### Restart Node without draining - -1. Repeat the above test but remove steps 1-2 which include draining and deleting the Node. diff --git a/tests/graceful-recovery/results/1.0.0/1.0.0.md b/tests/results/graceful-recovery/1.0.0/1.0.0.md similarity index 100% rename from tests/graceful-recovery/results/1.0.0/1.0.0.md rename to tests/results/graceful-recovery/1.0.0/1.0.0.md diff --git a/tests/graceful-recovery/results/1.1.0/1.1.0.md b/tests/results/graceful-recovery/1.1.0/1.1.0.md similarity index 100% rename from tests/graceful-recovery/results/1.1.0/1.1.0.md rename to tests/results/graceful-recovery/1.1.0/1.1.0.md diff --git a/tests/graceful-recovery/results/1.2.0/1.2.0.md b/tests/results/graceful-recovery/1.2.0/1.2.0.md similarity index 100% rename from tests/graceful-recovery/results/1.2.0/1.2.0.md rename to tests/results/graceful-recovery/1.2.0/1.2.0.md diff --git a/tests/graceful-recovery/results/1.3.0/1.3.0.md b/tests/results/graceful-recovery/1.3.0/1.3.0.md similarity index 100% rename from tests/graceful-recovery/results/1.3.0/1.3.0.md rename to tests/results/graceful-recovery/1.3.0/1.3.0.md From b104e2bd587e0556f8163f68dade37b93ae2cce1 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 09:50:09 -0700 Subject: [PATCH 08/32] Run pipeline --- tests/suite/graceful_recovery_test.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index c3b6c50803..5b891ea523 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -169,19 +169,10 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) - //_, err = exec.Command( - // "docker", - // "restart", - // containerName, - //).CombinedOutput() - //if err != nil { - // fmt.Println(fmt.Sprint(err.Error())) - // Expect(err).ToNot(HaveOccurred()) - //} - _, err = exec.Command( "docker", "restart", + containerName, ).CombinedOutput() if err != nil { fmt.Println(fmt.Sprint(err.Error())) From f7932bba79af7c58945feca793fd6754aee8a397 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 15:23:22 -0700 Subject: [PATCH 09/32] Add separate functions for draining and abrupt restart tests --- tests/suite/graceful_recovery_test.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 5b891ea523..884f77a253 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -102,16 +102,24 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu }) It("recovers when drained node is restarted", func() { - runRestartNodeTest(teaURL, coffeeURL, files, &ns, true) + runRestartNodeWithDrainingTest(teaURL, coffeeURL, files, &ns) }) It("recovers when node is restarted abruptly", func() { // FIXME(bjee19) remove Skip() when https://github.com/nginxinc/nginx-gateway-fabric/issues/1108 is completed. Skip("Test currently fails due to this issue: https://github.com/nginxinc/nginx-gateway-fabric/issues/1108") - runRestartNodeTest(teaURL, coffeeURL, files, &ns, false) + runRestartNodeAbruptlyTest(teaURL, coffeeURL, files, &ns) }) }) +func runRestartNodeWithDrainingTest(teaURL, coffeeURL string, files []string, ns *core.Namespace) { + runRestartNodeTest(teaURL, coffeeURL, files, ns, true) +} + +func runRestartNodeAbruptlyTest(teaURL, coffeeURL string, files []string, ns *core.Namespace) { + runRestartNodeTest(teaURL, coffeeURL, files, ns, false) +} + func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Namespace, drain bool) { nodeNames, err := getNodeNames() Expect(err).ToNot(HaveOccurred()) From 9ad9fb808ba851e284fdf765c39dca838a8f6f70 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Thu, 20 Jun 2024 15:26:28 -0700 Subject: [PATCH 10/32] Add review feedback --- tests/suite/graceful_recovery_test.go | 37 ++++++--------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 884f77a253..c674ad1391 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -139,30 +139,15 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names "--ignore-daemonsets", "--delete-local-data", ).CombinedOutput() - if err != nil { - Expect(err).ToNot(HaveOccurred()) - } - - _, err = exec.Command( - "kubectl", - "delete", - "node", - kindNodeName, - ).CombinedOutput() - if err != nil { - Expect(err).ToNot(HaveOccurred()) - } - } + Expect(err).ToNot(HaveOccurred()) - containerOutput, err := exec.Command( - "docker", - "container", - "ls", - ).CombinedOutput() - if err != nil { + _, err = exec.Command("kubectl", "delete", "node", kindNodeName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) } + containerOutput, err := exec.Command("docker", "container", "ls").CombinedOutput() + Expect(err).ToNot(HaveOccurred()) + var containerName string for _, line := range strings.Split(string(containerOutput), "\n") { for _, word := range strings.Split(line, " ") { @@ -177,15 +162,8 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } Expect(containerName).ToNot(Equal("")) - _, err = exec.Command( - "docker", - "restart", - containerName, - ).CombinedOutput() - if err != nil { - fmt.Println(fmt.Sprint(err.Error())) - Expect(err).ToNot(HaveOccurred()) - } + _, err = exec.Command("docker", "restart", containerName).CombinedOutput() + Expect(err).ToNot(HaveOccurred()) // need to wait for docker container to restart and be running before polling for ready NGF Pods or else we will error Eventually( @@ -211,6 +189,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names WithTimeout(timeoutConfig.CreateTimeout). WithPolling(500 * time.Millisecond). Should(BeTrue()) + ngfPodName := podNames[0] Expect(ngfPodName).ToNot(Equal("")) From 23f6ff080fc6b5876f22da5e51e76c25081b2de5 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Fri, 21 Jun 2024 10:09:21 -0700 Subject: [PATCH 11/32] Refactor docker inspect command --- tests/suite/graceful_recovery_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index c674ad1391..d5caf3fbc4 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -170,11 +170,12 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names func() bool { output, err := exec.Command( "docker", - "container", "inspect", + "-f", + "{{.State.Running}}", containerName, ).CombinedOutput() - return strings.Contains(string(output), "\"Running\": true") && err == nil + return strings.TrimSpace(string(output)) == "true" && err == nil }). WithTimeout(timeoutConfig.CreateTimeout). WithPolling(500 * time.Millisecond). From d14560155d484133a5b03ed4047d10114fa68fbc Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Fri, 21 Jun 2024 10:38:35 -0700 Subject: [PATCH 12/32] Use cluster name passed in from flag --- .github/workflows/functional.yml | 2 +- tests/Makefile | 2 +- tests/suite/graceful_recovery_test.go | 34 ++++++++++++++------------- tests/suite/system_suite_test.go | 1 + 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 33b7e65ceb..4667a8609e 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -123,7 +123,7 @@ jobs: run: | ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric ngf_tag=${{ steps.ngf-meta.outputs.version }} - make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=graceful-recovery GW_SERVICE_TYPE=LoadBalancer + make test${{ inputs.image == 'plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=graceful-recovery GW_SERVICE_TYPE=LoadBalancer CLUSTER_NAME=${{ github.run_id }} working-directory: ./tests - name: Run functional tests diff --git a/tests/Makefile b/tests/Makefile index ea035cc297..69912aeff3 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -114,7 +114,7 @@ stop-longevity-test: nfr-test ## Stop the longevity test and collects results --gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \ --pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \ - --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster_name=$(CLUSTER_NAME) .PHONY: test test: ## Runs the functional tests on your default k8s cluster diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index d5caf3fbc4..a2bbfcb316 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -145,22 +145,24 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } - containerOutput, err := exec.Command("docker", "container", "ls").CombinedOutput() - Expect(err).ToNot(HaveOccurred()) - - var containerName string - for _, line := range strings.Split(string(containerOutput), "\n") { - for _, word := range strings.Split(line, " ") { - // This is a potential weak spot in the code where we rely on the container which NGF - // is running on to contain "control-plane" in the name and for no other container to have that either. - // This is currently working in our test framework may break in the future. - if strings.Contains(word, "control-plane") { - containerName = strings.TrimSpace(word) - break - } - } - } - Expect(containerName).ToNot(Equal("")) + // containerOutput, err := exec.Command("docker", "container", "ls").CombinedOutput() + // Expect(err).ToNot(HaveOccurred()) + + containerName := *clusterName + "-control-plane" + fmt.Println("This is the container name: " + containerName) + //var containerName string + //for _, line := range strings.Split(string(containerOutput), "\n") { + // for _, word := range strings.Split(line, " ") { + // // This is a potential weak spot in the code where we rely on the container which NGF + // // is running on to contain "control-plane" in the name and for no other container to have that either. + // // This is currently working in our test framework may break in the future. + // if strings.Contains(word, "control-plane") { + // containerName = strings.TrimSpace(word) + // break + // } + // } + //} + //Expect(containerName).ToNot(Equal("")) _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index 2a7a391ea5..f959e3d01c 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -58,6 +58,7 @@ var ( serviceType = flag.String("service-type", "NodePort", "Type of service fronting NGF to be deployed") isGKEInternalLB = flag.Bool("is-gke-internal-lb", false, "Is the LB service GKE internal only") plusEnabled = flag.Bool("plus-enabled", false, "Is NGINX Plus enabled") + clusterName = flag.String("cluster-name", "kind", "Cluster name") ) var ( From 5c26b3962f9801d996e72badb2702ad80c770394 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Fri, 21 Jun 2024 10:55:04 -0700 Subject: [PATCH 13/32] Add cluster name flag to right make command --- tests/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index 69912aeff3..c0b4978c76 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -114,7 +114,7 @@ stop-longevity-test: nfr-test ## Stop the longevity test and collects results --gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \ --pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \ - --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster_name=$(CLUSTER_NAME) + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) .PHONY: test test: ## Runs the functional tests on your default k8s cluster @@ -124,7 +124,7 @@ test: ## Runs the functional tests on your default k8s cluster --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \ --pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \ - --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster_name=$(CLUSTER_NAME) .PHONY: test-with-plus test-with-plus: PLUS_ENABLED=true From a19bce479c4563694928dbf7ff578456513679af Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Fri, 21 Jun 2024 11:10:54 -0700 Subject: [PATCH 14/32] Correct flag name --- tests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Makefile b/tests/Makefile index c0b4978c76..dfe8506f56 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -124,7 +124,7 @@ test: ## Runs the functional tests on your default k8s cluster --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \ --pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \ - --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster_name=$(CLUSTER_NAME) + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) --cluster-name=$(CLUSTER_NAME) .PHONY: test-with-plus test-with-plus: PLUS_ENABLED=true From e752d21c5a94b66238ba9c0f7167b4f7a83cc427 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Fri, 21 Jun 2024 11:33:17 -0700 Subject: [PATCH 15/32] Remove comments --- tests/suite/graceful_recovery_test.go | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index a2bbfcb316..da78420407 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -145,25 +145,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } - // containerOutput, err := exec.Command("docker", "container", "ls").CombinedOutput() - // Expect(err).ToNot(HaveOccurred()) - containerName := *clusterName + "-control-plane" - fmt.Println("This is the container name: " + containerName) - //var containerName string - //for _, line := range strings.Split(string(containerOutput), "\n") { - // for _, word := range strings.Split(line, " ") { - // // This is a potential weak spot in the code where we rely on the container which NGF - // // is running on to contain "control-plane" in the name and for no other container to have that either. - // // This is currently working in our test framework may break in the future. - // if strings.Contains(word, "control-plane") { - // containerName = strings.TrimSpace(word) - // break - // } - // } - //} - //Expect(containerName).ToNot(Equal("")) - _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) From dd252007a3c3b21c8c035df0493d987cf99cc789 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 24 Jun 2024 13:18:40 -0700 Subject: [PATCH 16/32] Rebase with fixes to skipped failing tests --- tests/suite/graceful_recovery_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index da78420407..0ed4e462f3 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -106,8 +106,6 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu }) It("recovers when node is restarted abruptly", func() { - // FIXME(bjee19) remove Skip() when https://github.com/nginxinc/nginx-gateway-fabric/issues/1108 is completed. - Skip("Test currently fails due to this issue: https://github.com/nginxinc/nginx-gateway-fabric/issues/1108") runRestartNodeAbruptlyTest(teaURL, coffeeURL, files, &ns) }) }) @@ -178,6 +176,8 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names ngfPodName := podNames[0] Expect(ngfPodName).ToNot(Equal("")) + time.Sleep(20 * time.Second) + if portFwdPort != 0 { ports := []string{fmt.Sprintf("%d:80", ngfHTTPForwardedPort), fmt.Sprintf("%d:443", ngfHTTPSForwardedPort)} portForwardStopCh = make(chan struct{}) @@ -185,7 +185,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } - checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, files, ns) + checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, "", files, ns) } func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) { @@ -215,7 +215,7 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files Should(Succeed()) } - checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, files, ns) + checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName, files, ns) } func restartContainer(ngfPodName, containerName string) { @@ -313,12 +313,12 @@ func expectRequestToFail(appURL, address string) error { return nil } -func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName string, files []string, ns *core.Namespace) { +func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) { Eventually( func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout). + WithTimeout(timeoutConfig.RequestTimeout * 2). WithPolling(500 * time.Millisecond). Should(Succeed()) @@ -339,11 +339,11 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName string, files []string, func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout). + WithTimeout(timeoutConfig.RequestTimeout * 2). WithPolling(500 * time.Millisecond). Should(Succeed()) - checkContainerLogsForErrors(ngfPodName, true) + checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName) } // checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors. From c79b64401622a8a8ea6df44e63e677eea2065d18 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 24 Jun 2024 14:50:33 -0700 Subject: [PATCH 17/32] Teardown NGF between each test --- tests/suite/graceful_recovery_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 0ed4e462f3..ac8816d31c 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -47,7 +47,7 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu var ngfPodName string - BeforeAll(func() { + BeforeEach(func() { // this test is unique in that it will check the entire log of both ngf and nginx containers // for any errors, so in order to avoid errors generated in previous tests we will uninstall // NGF installed at the suite level, then re-deploy our own @@ -66,9 +66,7 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu if portFwdHTTPSPort != 0 { teaURL = fmt.Sprintf("%s:%d/tea", baseHTTPSURL, portFwdHTTPSPort) } - }) - BeforeEach(func() { ns = core.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: "graceful-recovery", From 3d2151b1a89abf07008a05e66ec326906d2a2fcb Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 24 Jun 2024 15:15:26 -0700 Subject: [PATCH 18/32] Run pipeline --- tests/suite/graceful_recovery_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index ac8816d31c..6b0c6bea6c 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -142,6 +142,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } containerName := *clusterName + "-control-plane" + fmt.Println("This is the containerName: " + containerName) _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) From 71affc507f623d03c0a22fd840fb2afc72c17cea Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 24 Jun 2024 15:22:03 -0700 Subject: [PATCH 19/32] Use BeEmpty instead of empty string --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 6b0c6bea6c..cb8f5b596c 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -173,7 +173,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Should(BeTrue()) ngfPodName := podNames[0] - Expect(ngfPodName).ToNot(Equal("")) + Expect(ngfPodName).ToNot(BeEmpty()) time.Sleep(20 * time.Second) From 7fe4691f1f945b95e2cf45295fa737f1063b31c5 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Mon, 24 Jun 2024 15:39:13 -0700 Subject: [PATCH 20/32] Remove functional label --- tests/suite/graceful_recovery_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index cb8f5b596c..c483592ea4 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -30,7 +30,7 @@ const ( // Since checkContainerLogsForErrors may experience interference from previous tests (as explained in the function // documentation), this test is recommended to be run separate from other tests. -var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "graceful-recovery"), func() { +var _ = Describe("Graceful Recovery test", Ordered, Label("graceful-recovery"), func() { files := []string{ "graceful-recovery/cafe.yaml", "graceful-recovery/cafe-secret.yaml", @@ -142,7 +142,6 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names } containerName := *clusterName + "-control-plane" - fmt.Println("This is the containerName: " + containerName) _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) From 53a65294d0132dd8a3c82d16454947c2b0d70bbc Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 10:57:07 -0700 Subject: [PATCH 21/32] Add stable readiness check --- tests/suite/graceful_recovery_test.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index c483592ea4..72f4e99723 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -141,7 +141,9 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } + Expect(*clusterName).ToNot(BeEmpty()) containerName := *clusterName + "-control-plane" + _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) @@ -161,11 +163,18 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names WithPolling(500 * time.Millisecond). Should(BeTrue()) + // ngf can often oscillate between ready and error, so we wait for a stable readiness in ngf var podNames []string + var count int Eventually( func() bool { podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout) - return len(podNames) == 1 && err == nil + if len(podNames) == 1 { + count++ + return count == 10 && err == nil + } + count = 0 + return false }). WithTimeout(timeoutConfig.CreateTimeout). WithPolling(500 * time.Millisecond). @@ -174,8 +183,6 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names ngfPodName := podNames[0] Expect(ngfPodName).ToNot(BeEmpty()) - time.Sleep(20 * time.Second) - if portFwdPort != 0 { ports := []string{fmt.Sprintf("%d:80", ngfHTTPForwardedPort), fmt.Sprintf("%d:443", ngfHTTPSForwardedPort)} portForwardStopCh = make(chan struct{}) @@ -316,7 +323,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout * 2). + WithTimeout(timeoutConfig.RequestTimeout). WithPolling(500 * time.Millisecond). Should(Succeed()) @@ -337,7 +344,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout * 2). + WithTimeout(timeoutConfig.RequestTimeout). WithPolling(500 * time.Millisecond). Should(Succeed()) @@ -345,10 +352,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, } // checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors. -// Since this function retrieves all the logs from both containers and the NGF pod may be shared between tests, -// the logs retrieved may contain log messages from previous tests, thus any errors in the logs from previous tests -// may cause an interference with this test and cause this test to fail. -// Additionally, when the NGINX process is killed, some errors are expected in the NGF logs while we wait for the +// When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the // NGINX container to be restarted. func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) { nginxLogs, err := resourceManager.GetPodLogs( From 43597adbf4c939aa23eca0e3941e3bf6c0e41160 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 11:26:34 -0700 Subject: [PATCH 22/32] Add back in extended timeout --- tests/suite/graceful_recovery_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 72f4e99723..c1526e0b82 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -323,7 +323,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout). + WithTimeout(timeoutConfig.RequestTimeout * 2). WithPolling(500 * time.Millisecond). Should(Succeed()) @@ -344,7 +344,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, func() error { return checkForWorkingTraffic(teaURL, coffeeURL) }). - WithTimeout(timeoutConfig.RequestTimeout). + WithTimeout(timeoutConfig.RequestTimeout * 2). WithPolling(500 * time.Millisecond). Should(Succeed()) From 666f35ff017bbb8f1dc02440ac77841826a1580b Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 11:59:19 -0700 Subject: [PATCH 23/32] Extend timout duration for waiting on NGF --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index c1526e0b82..38e56067e6 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -176,7 +176,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names count = 0 return false }). - WithTimeout(timeoutConfig.CreateTimeout). + WithTimeout(timeoutConfig.CreateTimeout * 2). WithPolling(500 * time.Millisecond). Should(BeTrue()) From 13ae8585f1f41a8703424c0976610b3bd127312a Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 13:41:08 -0700 Subject: [PATCH 24/32] Add skip if test is running on GKE --- tests/suite/system_suite_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index f959e3d01c..bfb82040bf 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -139,6 +139,10 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { Skip("GW_SERVICE_TYPE must be 'LoadBalancer' for NFR tests") } + if clusterInfo.IsGKE && strings.Contains(GinkgoLabelFilter(), "graceful-recovery") { + Skip("Graceful Recovery test must be run on Kind") + } + if *versionUnderTest != "" { version = *versionUnderTest } else if *imageTag != "" { From 88ac69a1bb00f0890e651516ab6b150bbead01fd Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 14:03:07 -0700 Subject: [PATCH 25/32] Increase stable readiness count --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 38e56067e6..144aa3b943 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -171,7 +171,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout) if len(podNames) == 1 { count++ - return count == 10 && err == nil + return count == 30 && err == nil } count = 0 return false From 1a7027f2a524e44a2c70e22347e6a53239a43af5 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Tue, 25 Jun 2024 14:45:32 -0700 Subject: [PATCH 26/32] Adjust readiness count --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 144aa3b943..a2f628c42b 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -171,7 +171,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout) if len(podNames) == 1 { count++ - return count == 30 && err == nil + return count == 20 && err == nil } count = 0 return false From 4c305d085ff1eaf2f5d75fd10c6d5f0ca5da5958 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 09:59:02 -0700 Subject: [PATCH 27/32] Update comment --- tests/suite/graceful_recovery_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index a2f628c42b..178828d0a1 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -50,7 +50,8 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("graceful-recovery"), BeforeEach(func() { // this test is unique in that it will check the entire log of both ngf and nginx containers // for any errors, so in order to avoid errors generated in previous tests we will uninstall - // NGF installed at the suite level, then re-deploy our own + // NGF installed at the suite level, then re-deploy our own. We will also uninstall and re-install + // NGF between each graceful-recovery test for the same reason. teardown(releaseName) setup(getDefaultSetupCfg()) From bb392b188def28421b3791e0931dcf7a96f5913b Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 10:00:49 -0700 Subject: [PATCH 28/32] Add nil check for clusterName --- tests/suite/graceful_recovery_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 178828d0a1..60cf888049 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -142,6 +142,7 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } + Expect(clusterName).ToNot(BeNil(), "clusterName variable not set") Expect(*clusterName).ToNot(BeEmpty()) containerName := *clusterName + "-control-plane" From 97693034e79c3fdfbd12fc4e7eea44ebf79ecc90 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 10:01:19 -0700 Subject: [PATCH 29/32] Adjust wording on error --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 60cf888049..91aa261601 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -451,7 +451,7 @@ func getNodeNames() ([]string, error) { var nodes core.NodeList if err := k8sClient.List(ctx, &nodes); err != nil { - return nil, fmt.Errorf("error getting nodes: %w", err) + return nil, fmt.Errorf("error listing nodes: %w", err) } names := make([]string, 0, len(nodes.Items)) From 18d153bb39ab9ebed2e2d07db2515023bd9f0bb8 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 10:31:54 -0700 Subject: [PATCH 30/32] Add MustPassRepeatedly to Eventually check --- tests/suite/graceful_recovery_test.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 91aa261601..5d8cf2452e 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -167,19 +167,14 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names // ngf can often oscillate between ready and error, so we wait for a stable readiness in ngf var podNames []string - var count int Eventually( func() bool { podNames, err = framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetStatusTimeout) - if len(podNames) == 1 { - count++ - return count == 20 && err == nil - } - count = 0 - return false + return len(podNames) == 1 && err == nil }). WithTimeout(timeoutConfig.CreateTimeout * 2). WithPolling(500 * time.Millisecond). + MustPassRepeatedly(20). Should(BeTrue()) ngfPodName := podNames[0] From 9370b2511372f52049b5494532bee63439f6aef7 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 11:48:45 -0700 Subject: [PATCH 31/32] Move checks on clusterName earlier --- tests/suite/graceful_recovery_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 5d8cf2452e..573572ac93 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -124,6 +124,10 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names kindNodeName := nodeNames[0] + Expect(clusterName).ToNot(BeNil(), "clusterName variable not set") + Expect(*clusterName).ToNot(BeEmpty()) + containerName := *clusterName + "-control-plane" + if portFwdPort != 0 { close(portForwardStopCh) } @@ -142,10 +146,6 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names Expect(err).ToNot(HaveOccurred()) } - Expect(clusterName).ToNot(BeNil(), "clusterName variable not set") - Expect(*clusterName).ToNot(BeEmpty()) - containerName := *clusterName + "-control-plane" - _, err = exec.Command("docker", "restart", containerName).CombinedOutput() Expect(err).ToNot(HaveOccurred()) From df28aa1496b93bd6163638d59aabe15571adac91 Mon Sep 17 00:00:00 2001 From: Benjamin Jee Date: Wed, 26 Jun 2024 14:43:10 -0700 Subject: [PATCH 32/32] Re-run pipeline --- tests/suite/graceful_recovery_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index 573572ac93..a9edbbe2d8 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -348,7 +348,7 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string, checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName) } -// checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors. +// checkContainerLogsForErrors checks both nginx and NGF container's logs for any possible errors. // When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the // NGINX container to be restarted. func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) {