Skip to content

Commit

Permalink
Don't deregister checks on stopped pods (#384)
Browse files Browse the repository at this point in the history
* Don't deregister checks on stopped pods

* Warn instead of error when svc not registered

There are cases where a pod shutting down will cause us to log errors
about not being able to register a health check. Instead these should be
warnings because it's not a situation we can recover from.
  • Loading branch information
lkysow authored Nov 10, 2020
1 parent 548ae2e commit 2b86fdd
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 6 deletions.
42 changes: 40 additions & 2 deletions connect-inject/health_check_resource.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package connectinject

import (
"errors"
"fmt"
"net/url"
"strings"
"sync"
"time"

Expand All @@ -28,6 +30,9 @@ const (
podPendingReasonMsg = "Pod is pending"
)

// ServiceNotFoundErr is returned when a Consul service instance is not registered.
var ServiceNotFoundErr = errors.New("service is not registered in Consul")

type HealthCheckResource struct {
Log hclog.Logger
KubernetesClientset kubernetes.Interface
Expand Down Expand Up @@ -166,7 +171,10 @@ func (h *HealthCheckResource) reconcilePod(pod *corev1.Pod) error {
// Create a new health check.
h.Log.Debug("registering new health check", "name", pod.Name, "id", healthCheckID)
err = h.registerConsulHealthCheck(client, healthCheckID, serviceID, status)
if err != nil {
if errors.Is(err, ServiceNotFoundErr) {
h.Log.Warn("skipping registration because service not registered with Consul - this may be because the pod is shutting down", "serviceID", serviceID)
return nil
} else if err != nil {
return fmt.Errorf("unable to register health check: %s", err)
}
// Also update it, the reason this is separate is there is no way to set the Output field of the health check
Expand Down Expand Up @@ -212,6 +220,11 @@ func (h *HealthCheckResource) registerConsulHealthCheck(client *api.Client, cons
},
})
if err != nil {
// Full error looks like:
// Unexpected response code: 500 (ServiceID "consulnamespace/svc-id" does not exist)
if strings.Contains(err.Error(), fmt.Sprintf("%s\" does not exist", serviceID)) {
return ServiceNotFoundErr
}
return fmt.Errorf("registering health check for service %q: %s", serviceID, err)
}
return nil
Expand Down Expand Up @@ -278,11 +291,36 @@ func (h *HealthCheckResource) shouldProcess(pod *corev1.Pod) bool {
if pod.Annotations[annotationStatus] != injected {
return false
}

// If the pod has been terminated, we don't want to try and modify its
// health check status because the preStop hook will have deregistered
// this pod and so we'll get errors making API calls to set the status
// of a check for a service that doesn't exist.
// We detect a terminated pod by looking to see if all the containers
// have their state set as "terminated". Kubernetes will only send
// an update to this reconciler when all containers have stopped so if
// the conditions below are satisfied we're guaranteed that the preStop
// hook has run.
if pod.Status.Phase == corev1.PodRunning && len(pod.Status.ContainerStatuses) > 0 {
allTerminated := true
for _, c := range pod.Status.ContainerStatuses {
if c.State.Terminated == nil {
allTerminated = false
break
}
}
if allTerminated {
return false
}
// Otherwise we fall through to checking if the service has been
// registered yet.
}

// We process any pod that has had its injection init container completed because
// this means the service instance has been registered with Consul and so we can
// and should set its health check status. If we don't set the health check
// immediately after registration, the pod will start to receive traffic,
// even if its non-init containers haven't yet been started.
// even if its non-init containers haven't yet reached the running state.
for _, c := range pod.Status.InitContainerStatuses {
if c.Name == InjectInitContainerName {
return c.State.Terminated != nil && c.State.Terminated.Reason == "Completed"
Expand Down
77 changes: 73 additions & 4 deletions connect-inject/health_check_resource_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,9 @@ func TestReconcilePod(t *testing.T) {
}
}

func TestUpsert_PodWithNoServiceReturnsError(t *testing.T) {
// Test that when we call upsert and the service hasn't been registered
// in Consul yet, we don't return an error.
func TestUpsert_PodWithNoService(t *testing.T) {
t.Parallel()
require := require.New(t)
pod := &corev1.Pod{
Expand All @@ -403,9 +405,8 @@ func TestUpsert_PodWithNoServiceReturnsError(t *testing.T) {
}
server, _, resource := testServerAgentResourceAndController(t, pod)
defer server.Stop()
// Start Upsert, it will attempt to reconcile the Pod but the service doesnt exist in Consul so will fail.
err := resource.Upsert("", pod)
require.Contains(err.Error(), "test-pod-test-service\" does not exist)")
require.Nil(err)
}

func TestReconcile_IgnorePodsWithoutInjectLabel(t *testing.T) {
Expand Down Expand Up @@ -441,7 +442,7 @@ func TestReconcile_IgnorePodsWithoutInjectLabel(t *testing.T) {
}

// Test pod statuses that the reconciler should ignore.
// These test cases are based on actual observed startup phases.
// These test cases are based on actual observed startup and termination phases.
func TestReconcile_IgnoreStatuses(t *testing.T) {
t.Parallel()
cases := map[string]corev1.PodStatus{
Expand Down Expand Up @@ -522,6 +523,74 @@ func TestReconcile_IgnoreStatuses(t *testing.T) {
},
ContainerStatuses: unreadyAppContainers,
},
"pod is terminating": {
Phase: corev1.PodRunning,
Conditions: []corev1.PodCondition{
{
Type: corev1.PodScheduled,
Status: corev1.ConditionTrue,
},
{
Type: corev1.PodInitialized,
Status: corev1.ConditionTrue,
},
{
Type: corev1.PodReady,
Status: corev1.ConditionFalse,
},
{
Type: corev1.ContainersReady,
Status: corev1.ConditionFalse,
},
},
InitContainerStatuses: []corev1.ContainerStatus{
{
Name: InjectInitContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{StartedAt: metav1.Now()},
},
Ready: true,
},
},
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "consul-connect-envoy-sidecar",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 0,
Reason: "Completed",
StartedAt: metav1.Time{},
FinishedAt: metav1.Time{},
},
},
Ready: false,
},
{
Name: "consul-connect-lifecycle-sidecar",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 2,
Reason: "Error",
StartedAt: metav1.Time{},
FinishedAt: metav1.Time{},
},
},
Ready: false,
},
{
Name: "app",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 137,
Reason: "Error",
StartedAt: metav1.Time{},
FinishedAt: metav1.Time{},
},
},
Ready: false,
},
},
},
}
for name, podStatus := range cases {
t.Run(name, func(t *testing.T) {
Expand Down

0 comments on commit 2b86fdd

Please sign in to comment.