Skip to content
This repository has been archived by the owner on Oct 24, 2023. It is now read-only.

Commit

Permalink
test: more resilient “wait for successful pod readiness checks” (#2015)
Browse files Browse the repository at this point in the history
  • Loading branch information
jackfrancis authored Sep 24, 2019
1 parent 03c5613 commit 50d8411
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 39 deletions.
42 changes: 21 additions & 21 deletions test/e2e/kubernetes/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
var pods []pod.Pod

testPortForward := func(deploymentName string) {
running, podWaitErr := pod.WaitOnReady(deploymentName, deploymentNamespace, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, podWaitErr := pod.WaitOnSuccesses(deploymentName, deploymentNamespace, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(podWaitErr).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
pods, err = deploy.Pods()
Expand Down Expand Up @@ -609,11 +609,11 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
var running bool
if common.IsKubernetesVersionGe(eng.ExpandedDefinition.Properties.OrchestratorProfile.OrchestratorVersion, "1.12.0") {
By("Ensuring that coredns is running")
running, err = pod.WaitOnReady("coredns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses("coredns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)

} else {
By("Ensuring that kube-dns is running")
running, err = pod.WaitOnReady("kube-dns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses("kube-dns", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
}
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
Expand All @@ -626,7 +626,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
}
for _, componentName := range coreComponents {
By(fmt.Sprintf("Ensuring that %s is Running", componentName))
running, err := pod.WaitOnReady(componentName, "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(componentName, "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
}
Expand Down Expand Up @@ -689,7 +689,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
if hasAddon, addon := eng.HasAddon(addonName); hasAddon {
for _, addonPod := range addonPods {
By(fmt.Sprintf("Ensuring that the %s addon is Running", addonName))
running, err := pod.WaitOnReady(addonPod, addonNamespace, kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(addonPod, addonNamespace, kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
By(fmt.Sprintf("Ensuring that the correct resources have been applied for %s", addonPod))
Expand All @@ -710,7 +710,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu

It("should have the correct tiller configuration", func() {
if hasTiller, tillerAddon := eng.HasAddon("tiller"); hasTiller {
running, err := pod.WaitOnReady("tiller", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses("tiller", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
pods, err := pod.GetAllByPrefix("tiller-deploy", "kube-system")
Expand All @@ -729,7 +729,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
It("should have the expected omsagent cluster footprint", func() {
if hasContainerMonitoring, _ := eng.HasAddon("container-monitoring"); hasContainerMonitoring {
By("Validating the omsagent replicaset")
running, err := pod.WaitOnReady("omsagent-rs", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses("omsagent-rs", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
pods, err := pod.GetAllByPrefix("omsagent-rs", "kube-system")
Expand All @@ -743,7 +743,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())
Expect(pass).To(BeTrue())
By("Validating the omsagent daemonset")
running, err = pod.WaitOnReady("omsagent", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses("omsagent", "kube-system", kubeSystemPodsReadinessChecks, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
pods, err = pod.GetAllByPrefix("omsagent", "kube-system")
Expand Down Expand Up @@ -775,7 +775,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Ensuring that php-apache pod is running")
running, err := pod.WaitOnReady(longRunningApacheDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(longRunningApacheDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -989,7 +989,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
curlDeploymentName := fmt.Sprintf("%s-%v", deploymentPrefix, r.Intn(99999))
curlDeploy, err := deployment.CreateLinuxDeployDeleteIfExists(deploymentPrefix, "library/nginx:latest", curlDeploymentName, "default", "--replicas=2")
Expect(err).NotTo(HaveOccurred())
running, err := pod.WaitOnReady(curlDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(curlDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
curlPods, err := curlDeploy.Pods()
Expand Down Expand Up @@ -1081,7 +1081,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Ensuring that the php-apache pod is running")
running, err := pod.WaitOnReady(longRunningApacheDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(longRunningApacheDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -1115,7 +1115,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Ensuring there are 3 load test pods")
running, err = pod.WaitOnReady(loadTestName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(loadTestName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -1353,22 +1353,22 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Ensure there is a running frontend-prod pod")
running, err := pod.WaitOnReady(frontendProdDeploymentName, nsProd, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(frontendProdDeploymentName, nsProd, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

By("Ensure there is a running frontend-dev pod")
running, err = pod.WaitOnReady(frontendDevDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(frontendDevDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

By("Ensure there is a running backend pod")
running, err = pod.WaitOnReady(backendDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(backendDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

By("Ensure there is a running network-policy pod")
running, err = pod.WaitOnReady(nwpolicyDeploymentName, nsDev, 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(nwpolicyDeploymentName, nsDev, 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -1562,7 +1562,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Waiting on pod to be Ready")
running, err := pod.WaitOnReady(deploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(deploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -1597,7 +1597,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Waiting on 5 pods to be Ready")
running, err = pod.WaitOnReady(deploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(deploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
iisPods, err = iisDeploy.Pods()
Expand Down Expand Up @@ -1678,12 +1678,12 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(err).NotTo(HaveOccurred())

By("Ensure there is a Running nginx pod")
running, err := pod.WaitOnReady(nginxDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err := pod.WaitOnSuccesses(nginxDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

By("Ensure there is a Running iis pod")
running, err = pod.WaitOnReady(windowsDeploymentName, "default", 3, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
running, err = pod.WaitOnSuccesses(windowsDeploymentName, "default", 4, sleepBetweenRetriesWhenWaitingForPodReady, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))

Expand Down Expand Up @@ -1743,7 +1743,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
deploymentName := fmt.Sprintf("iis-%s-%v", cfg.Name, r.Intn(99999))
iisDeploy, err := deployment.CreateWindowsDeployIfNotExist(iisImage, deploymentName, "default", 80, hostport)
Expect(err).NotTo(HaveOccurred())
running, err := pod.WaitOnReady(deploymentName, "default", 3, 30*time.Second, cfg.Timeout)
running, err := pod.WaitOnSuccesses(deploymentName, "default", 4, 30*time.Second, cfg.Timeout)
Expect(err).NotTo(HaveOccurred())
Expect(running).To(Equal(true))
iisPods, err := iisDeploy.Pods()
Expand Down
41 changes: 23 additions & 18 deletions test/e2e/kubernetes/pod/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,15 +537,16 @@ func GetPodAsync(name, namespace string, timeout time.Duration) GetPodResult {
}
}

// WaitOnReady returns true if all pods matching a prefix substring are in a succeeded state within a period of time
// WaitOnSuccesses returns true once all pods matching a prefix substring have been observed Ready successesNeeded times within a period of time
// successesNeeded is used to make sure we return the correct value even if the pod is in a CrashLoop
func WaitOnReady(podPrefix, namespace string, successesNeeded int, sleep, timeout time.Duration) (bool, error) {
func WaitOnSuccesses(podPrefix, namespace string, successesNeeded int, sleep, timeout time.Duration) (bool, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
ch := make(chan AreAllPodsRunningResult)
var mostRecentWaitOnReadyErr error
var mostRecentWaitOnSuccessesErr error
successCount := 0
failureCount := 0
flapCount := 0
var lastResult bool
go func() {
for {
select {
Expand All @@ -559,24 +560,28 @@ func WaitOnReady(podPrefix, namespace string, successesNeeded int, sleep, timeou
for {
select {
case result := <-ch:
mostRecentWaitOnReadyErr = result.err
if result.ready {
successCount++
if successCount >= successesNeeded {
return true, nil
}
} else {
if successCount > 1 {
failureCount++
if failureCount >= successesNeeded {
PrintPodsLogs(podPrefix, namespace)
return false, errors.Errorf("Pods from deployment (%s) in namespace (%s) have been checked out as all Ready %d times, but NotReady %d times. This behavior may mean it is in a crashloop", podPrefix, namespace, successCount, failureCount)
mostRecentWaitOnSuccessesErr = result.err
if mostRecentWaitOnSuccessesErr == nil {
if result.ready {
lastResult = true
successCount++
if successCount >= successesNeeded {
return true, nil
}
} else {
if lastResult {
flapCount++
if flapCount >= (successesNeeded - 1) {
PrintPodsLogs(podPrefix, namespace)
return false, errors.Errorf("Pods from deployment (%s) in namespace (%s) have been checked out as all Ready %d times, but included %d transitions away from a Ready state. This behavior may mean it is in a crashloop", podPrefix, namespace, successCount, flapCount)
}
lastResult = false
}
}
}
case <-ctx.Done():
PrintPodsLogs(podPrefix, namespace)
return false, errors.Errorf("WaitOnReady timed out: %s\n", mostRecentWaitOnReadyErr)
return false, errors.Errorf("WaitOnReady timed out: %s\n", mostRecentWaitOnSuccessesErr)
}
}
}
Expand Down Expand Up @@ -683,7 +688,7 @@ func WaitOnTerminated(name, namespace, containerName string, sleep, containerExe

// WaitOnReady will call the static method WaitOnSuccesses passing in p.Metadata.Name and p.Metadata.Namespace
func (p *Pod) WaitOnReady(sleep, timeout time.Duration) (bool, error) {
return WaitOnReady(p.Metadata.Name, p.Metadata.Namespace, 6, sleep, timeout)
return WaitOnSuccesses(p.Metadata.Name, p.Metadata.Namespace, 6, sleep, timeout)
}

// WaitOnSucceeded will call the static method WaitOnSucceeded passing in p.Metadata.Name and p.Metadata.Namespace
Expand Down

0 comments on commit 50d8411

Please sign in to comment.