From 50d84112dc1f2404ec14aefa85ed0f404a363070 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 24 Sep 2019 14:06:45 -0700 Subject: [PATCH] =?UTF-8?q?test:=20more=20resilient=20=E2=80=9Cwait=20for?= =?UTF-8?q?=20successful=20pod=20readiness=20checks=E2=80=9D=20(#2015)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/e2e/kubernetes/kubernetes_test.go | 42 +++++++++++++------------- test/e2e/kubernetes/pod/pod.go | 41 ++++++++++++++----------- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/test/e2e/kubernetes/kubernetes_test.go b/test/e2e/kubernetes/kubernetes_test.go index 6ebd91e9f8..869dee1183 100644 --- a/test/e2e/kubernetes/kubernetes_test.go +++ b/test/e2e/kubernetes/kubernetes_test.go @@ -438,7 +438,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu var pods []pod.Pod testPortForward := func(deploymentName string) { - running, podWaitErr := pod.WaitOnReady(deploymentName, deploymentNamespace, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, podWaitErr := pod.WaitOnSuccesses(deploymentName, deploymentNamespace, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(podWaitErr).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) pods, err = deploy.Pods() @@ -609,11 +609,11 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu var running bool if common.IsKubernetesVersionGe(eng.ExpandedDefinition.Properties.OrchestratorProfile.OrchestratorVersion, "1.12.0") { By("Ensuring that coredns is running") - running, err = pod.WaitOnReady("coredns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses("coredns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) } else { By("Ensuring that kube-dns is running") - running, err = pod.WaitOnReady("kube-dns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses("kube-dns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) } Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -626,7 +626,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu } for _, componentName := range coreComponents { By(fmt.Sprintf("Ensuring that %s is Running", componentName)) - running, err := pod.WaitOnReady(componentName, "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(componentName, "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) } @@ -689,7 +689,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu if hasAddon, addon := eng.HasAddon(addonName); hasAddon { for _, addonPod := range addonPods { By(fmt.Sprintf("Ensuring that the %s addon is Running", addonName)) - running, err := pod.WaitOnReady(addonPod, addonNamespace, kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(addonPod, addonNamespace, kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) By(fmt.Sprintf("Ensuring that the correct resources have been applied for %s", addonPod)) @@ -710,7 +710,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu It("should have the correct tiller configuration", func() { if hasTiller, tillerAddon := eng.HasAddon("tiller"); hasTiller { - running, err := pod.WaitOnReady("tiller", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses("tiller", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) pods, err := pod.GetAllByPrefix("tiller-deploy", "kube-system") @@ -729,7 +729,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu It("should have the expected omsagent cluster footprint", func() { if hasContainerMonitoring, _ := eng.HasAddon("container-monitoring"); hasContainerMonitoring { By("Validating the omsagent replicaset") - running, err := pod.WaitOnReady("omsagent-rs", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses("omsagent-rs", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) pods, err := pod.GetAllByPrefix("omsagent-rs", "kube-system") @@ -743,7 +743,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) Expect(pass).To(BeTrue()) By("Validating the omsagent daemonset") - running, err = pod.WaitOnReady("omsagent", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses("omsagent", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) pods, err = pod.GetAllByPrefix("omsagent", "kube-system") @@ -775,7 +775,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Ensuring that php-apache pod is running") - running, err := pod.WaitOnReady(longRunningApacheDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(longRunningApacheDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -989,7 +989,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu curlDeploymentName := fmt.Sprintf("%s-%v", deploymentPrefix, r.Intn(99999)) curlDeploy, err := deployment.CreateLinuxDeployDeleteIfExists(deploymentPrefix, "library/nginx:latest", curlDeploymentName, "default", "--replicas=2") Expect(err).NotTo(HaveOccurred()) - running, err := pod.WaitOnReady(curlDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(curlDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) curlPods, err := curlDeploy.Pods() @@ -1081,7 +1081,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Ensuring that the php-apache pod is running") - running, err := pod.WaitOnReady(longRunningApacheDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(longRunningApacheDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -1115,7 +1115,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Ensuring there are 3 load test pods") - running, err = pod.WaitOnReady(loadTestName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(loadTestName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -1353,22 +1353,22 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Ensure there is a running frontend-prod pod") - running, err := pod.WaitOnReady(frontendProdDeploymentName, nsProd, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(frontendProdDeploymentName, nsProd, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) By("Ensure there is a running frontend-dev pod") - running, err = pod.WaitOnReady(frontendDevDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(frontendDevDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) By("Ensure there is a running backend pod") - running, err = pod.WaitOnReady(backendDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(backendDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) By("Ensure there is a running network-policy pod") - running, err = pod.WaitOnReady(nwpolicyDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(nwpolicyDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -1562,7 +1562,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Waiting on pod to be Ready") - running, err := pod.WaitOnReady(deploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(deploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -1597,7 +1597,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Waiting on 5 pods to be Ready") - running, err = pod.WaitOnReady(deploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(deploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) iisPods, err = iisDeploy.Pods() @@ -1678,12 +1678,12 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu Expect(err).NotTo(HaveOccurred()) By("Ensure there is a Running nginx pod") - running, err := pod.WaitOnReady(nginxDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err := pod.WaitOnSuccesses(nginxDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) By("Ensure there is a Running iis pod") - running, err = pod.WaitOnReady(windowsDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) + running, err = pod.WaitOnSuccesses(windowsDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) @@ -1743,7 +1743,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu deploymentName := fmt.Sprintf("iis-%s-%v", cfg.Name, r.Intn(99999)) iisDeploy, err := deployment.CreateWindowsDeployIfNotExist(iisImage, deploymentName, "default", 80, hostport) Expect(err).NotTo(HaveOccurred()) - running, err := pod.WaitOnReady(deploymentName, "default", 3, 30*time.Second, cfg.Timeout) + running, err := pod.WaitOnSuccesses(deploymentName, "default", 4, 30*time.Second, cfg.Timeout) Expect(err).NotTo(HaveOccurred()) Expect(running).To(Equal(true)) iisPods, err := iisDeploy.Pods() diff --git a/test/e2e/kubernetes/pod/pod.go b/test/e2e/kubernetes/pod/pod.go index 369c27fa2c..e31bcbe4fa 100644 --- a/test/e2e/kubernetes/pod/pod.go +++ b/test/e2e/kubernetes/pod/pod.go @@ -537,15 +537,16 @@ func GetPodAsync(name, namespace string, timeout time.Duration) GetPodResult { } } -// WaitOnReady returns true if all pods matching a prefix substring are in a succeeded state within a period of time +// WaitOnSuccesses returns true if all pods matching a prefix substring are in a succeeded state within a period of time // successesNeeded is used to make sure we return the correct value even if the pod is in a CrashLoop -func WaitOnReady(podPrefix, namespace string, successesNeeded int, sleep, timeout time.Duration) (bool, error) { +func WaitOnSuccesses(podPrefix, namespace string, successesNeeded int, sleep, timeout time.Duration) (bool, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() ch := make(chan AreAllPodsRunningResult) - var mostRecentWaitOnReadyErr error + var mostRecentWaitOnSuccessesErr error successCount := 0 - failureCount := 0 + flapCount := 0 + var lastResult bool go func() { for { select { @@ -559,24 +560,28 @@ func WaitOnReady(podPrefix, namespace string, successesNeeded int, sleep, timeou for { select { case result := <-ch: - mostRecentWaitOnReadyErr = result.err - if result.ready { - successCount++ - if successCount >= successesNeeded { - return true, nil - } - } else { - if successCount > 1 { - failureCount++ - if failureCount >= successesNeeded { - PrintPodsLogs(podPrefix, namespace) - return false, errors.Errorf("Pods from deployment (%s) in namespace (%s) have been checked out as all Ready %d times, but NotReady %d times. This behavior may mean it is in a crashloop", podPrefix, namespace, successCount, failureCount) + mostRecentWaitOnSuccessesErr = result.err + if mostRecentWaitOnSuccessesErr == nil { + if result.ready { + lastResult = true + successCount++ + if successCount >= successesNeeded { + return true, nil + } + } else { + if lastResult { + flapCount++ + if flapCount >= (successesNeeded - 1) { + PrintPodsLogs(podPrefix, namespace) + return false, errors.Errorf("Pods from deployment (%s) in namespace (%s) have been checked out as all Ready %d times, but included %d transitions away from a Ready state. This behavior may mean it is in a crashloop", podPrefix, namespace, successCount, flapCount) + } + lastResult = false } } } case <-ctx.Done(): PrintPodsLogs(podPrefix, namespace) - return false, errors.Errorf("WaitOnReady timed out: %s\n", mostRecentWaitOnReadyErr) + return false, errors.Errorf("WaitOnReady timed out: %s\n", mostRecentWaitOnSuccessesErr) } } } @@ -683,7 +688,7 @@ func WaitOnTerminated(name, namespace, containerName string, sleep, containerExe // WaitOnReady will call the static method WaitOnReady passing in p.Metadata.Name and p.Metadata.Namespace func (p *Pod) WaitOnReady(sleep, timeout time.Duration) (bool, error) { - return WaitOnReady(p.Metadata.Name, p.Metadata.Namespace, 6, sleep, timeout) + return WaitOnSuccesses(p.Metadata.Name, p.Metadata.Namespace, 6, sleep, timeout) } // WaitOnSucceeded will call the static method WaitOnSucceeded passing in p.Metadata.Name and p.Metadata.Namespace