From 972772440ac217267e3337ed8b7a53aa338cb1aa Mon Sep 17 00:00:00 2001
From: Aleksandra Malinowska
Date: Thu, 25 May 2017 14:22:10 +0200
Subject: [PATCH] Add failing health check if autoscaler loop consistently
 returns error

---
 cluster-autoscaler/main.go                  |  7 +-
 cluster-autoscaler/metrics/liveness.go      | 49 ++++++++++----
 cluster-autoscaler/metrics/liveness_test.go | 72 ++++++++++++++++++---
 3 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index fc4b4c0f26fa..4018a4fd60aa 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -99,7 +99,8 @@ var (
 		"Type of node group expander to be used in scale up. Available values: ["+strings.Join(expander.AvailableExpanders, ",")+"]")
 
 	writeStatusConfigMapFlag = flag.Bool("write-status-configmap", true, "Should CA write status information to a configmap")
-	maxInactivityFlag        = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
+	maxInactivityTimeFlag    = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
+	maxFailingTimeFlag       = flag.Duration("max-failing-time", 15*time.Minute, "Maximum time from last recorded successful autoscaler run before automatic restart")
 )
 
 func createAutoscalerOptions() core.AutoscalerOptions {
@@ -195,6 +196,8 @@ func run(healthCheck *metrics.HealthCheck) {
 			err := autoscaler.RunOnce(loopStart)
 			if err != nil && err.Type() != errors.TransientError {
 				metrics.RegisterError(err)
+			} else {
+				healthCheck.UpdateLastSuccessfulRun(time.Now())
 			}
 
 			metrics.UpdateDuration("main", loopStart)
@@ -212,7 +215,7 @@ func main() {
 		"Can be used multiple times. Format: <min>:<max>:<other...>")
 	kube_flag.InitFlags()
 
-	healthCheck := metrics.NewHealthCheck(*maxInactivityFlag)
+	healthCheck := metrics.NewHealthCheck(*maxInactivityTimeFlag, *maxFailingTimeFlag)
 
 	glog.Infof("Cluster Autoscaler %s", ClusterAutoscalerVersion)
 
diff --git a/cluster-autoscaler/metrics/liveness.go b/cluster-autoscaler/metrics/liveness.go
index 321f10bf7aaf..ee85551fb7be 100644
--- a/cluster-autoscaler/metrics/liveness.go
+++ b/cluster-autoscaler/metrics/liveness.go
@@ -25,19 +25,24 @@ import (
 
 // HealthCheck contains information about last time of autoscaler activity and timeout
 type HealthCheck struct {
-	lastActivity time.Time
-	mutex        *sync.Mutex
-	timeout      time.Duration
-	checkTimeout bool
+	lastActivity      time.Time
+	lastSuccessfulRun time.Time
+	mutex             *sync.Mutex
+	activityTimeout   time.Duration
+	successTimeout    time.Duration
+	checkTimeout      bool
 }
 
 // NewHealthCheck builds new HealthCheck object with given timeout
-func NewHealthCheck(timeout time.Duration) *HealthCheck {
+func NewHealthCheck(activityTimeout, successTimeout time.Duration) *HealthCheck {
+	now := time.Now()
 	return &HealthCheck{
-		lastActivity: time.Now(),
-		mutex:        &sync.Mutex{},
-		timeout:      timeout,
-		checkTimeout: false,
+		lastActivity:      now,
+		lastSuccessfulRun: now,
+		mutex:             &sync.Mutex{},
+		activityTimeout:   activityTimeout,
+		successTimeout:    successTimeout,
+		checkTimeout:      false,
 	}
 }
 
@@ -50,18 +55,27 @@ func (hc *HealthCheck) StartMonitoring() {
 	if now.After(hc.lastActivity) {
 		hc.lastActivity = now
 	}
+	if now.After(hc.lastSuccessfulRun) {
+		hc.lastSuccessfulRun = now
+	}
 }
 
 // ServeHTTP implements http.Handler interface to provide a health-check endpoint
 func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	hc.mutex.Lock()
+
 	lastActivity := hc.lastActivity
-	timedOut := hc.checkTimeout && time.Now().After(lastActivity.Add(hc.timeout))
+	lastSuccessfulRun := hc.lastSuccessfulRun
+	now := time.Now()
+	activityTimedOut := now.After(lastActivity.Add(hc.activityTimeout))
+	successTimedOut := now.After(lastSuccessfulRun.Add(hc.successTimeout))
+	timedOut := hc.checkTimeout && (activityTimedOut || successTimedOut)
+
 	hc.mutex.Unlock()
 
 	if timedOut {
 		w.WriteHeader(500)
-		w.Write([]byte(fmt.Sprintf("Error: last activity more than %v ago", time.Now().Sub(lastActivity).String())))
+		w.Write([]byte(fmt.Sprintf("Error: last activity more than %v ago, last success more than %v ago", time.Now().Sub(lastActivity).String(), time.Now().Sub(lastSuccessfulRun).String())))
 	} else {
 		w.WriteHeader(200)
 		w.Write([]byte("OK"))
@@ -76,3 +90,16 @@ func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time) {
 		hc.lastActivity = timestamp
 	}
 }
+
+// UpdateLastSuccessfulRun updates last time of successful (i.e. not ending in error) activity
+func (hc *HealthCheck) UpdateLastSuccessfulRun(timestamp time.Time) {
+	hc.mutex.Lock()
+	defer hc.mutex.Unlock()
+	if timestamp.After(hc.lastSuccessfulRun) {
+		hc.lastSuccessfulRun = timestamp
+	}
+	// finishing successful run is also a sign of activity
+	if timestamp.After(hc.lastActivity) {
+		hc.lastActivity = timestamp
+	}
+}
diff --git a/cluster-autoscaler/metrics/liveness_test.go b/cluster-autoscaler/metrics/liveness_test.go
index ed57e3dfc460..9341a5cb8a66 100644
--- a/cluster-autoscaler/metrics/liveness_test.go
+++ b/cluster-autoscaler/metrics/liveness_test.go
@@ -24,48 +24,52 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 
-func getTestResponse(start time.Time, timeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
+func getTestResponse(start time.Time, activityTimeout, successTimeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
 	req := httptest.NewRequest("GET", "/health-check", nil)
 	w := httptest.NewRecorder()
-	healthCheck := NewHealthCheck(timeout)
+	healthCheck := NewHealthCheck(activityTimeout, successTimeout)
 	if checkMonitoring {
 		healthCheck.StartMonitoring()
 	}
 	healthCheck.lastActivity = start
+	healthCheck.lastSuccessfulRun = start
 	healthCheck.ServeHTTP(w, req)
 	return w
 }
 
 func TestOkServeHTTP(t *testing.T) {
-	w := getTestResponse(time.Now(), time.Second, true)
+	w := getTestResponse(time.Now(), time.Second, time.Second, true)
 	assert.Equal(t, 200, w.Code)
 }
 
-func TestFailServeHTTP(t *testing.T) {
-	w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, true)
+func TestFailTimeoutServeHTTP(t *testing.T) {
+	w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, true)
 	assert.Equal(t, 500, w.Code)
 }
 
 func TestMonitoringOffAfterTimeout(t *testing.T) {
-	w := getTestResponse(time.Now(), time.Second, false)
+	w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, false)
 	assert.Equal(t, 200, w.Code)
 }
 
 func TestMonitoringOffBeforeTimeout(t *testing.T) {
-	w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, false)
+	w := getTestResponse(time.Now().Add(time.Second*2), time.Second, time.Second, false)
 	assert.Equal(t, 200, w.Code)
 }
 
 func TestUpdateLastActivity(t *testing.T) {
 	timeout := time.Second
 	start := time.Now().Add(timeout * -2)
+	// to make sure it doesn't cause health check failure
+	lastSuccess := time.Now().Add(timeout * 10)
 
 	req := httptest.NewRequest("GET", "/health-check", nil)
-	w := httptest.NewRecorder()
-	healthCheck := NewHealthCheck(timeout)
+	healthCheck := NewHealthCheck(timeout, timeout)
 	healthCheck.StartMonitoring()
 	healthCheck.lastActivity = start
+	healthCheck.lastSuccessfulRun = lastSuccess
 
+	w := httptest.NewRecorder()
 	healthCheck.ServeHTTP(w, req)
 	assert.Equal(t, 500, w.Code)
 
@@ -74,3 +78,53 @@ func TestUpdateLastActivity(t *testing.T) {
 	healthCheck.ServeHTTP(w, req)
 	assert.Equal(t, 200, w.Code)
 }
+
+func TestUpdateActivityAtUpdateLastSuccessfulRun(t *testing.T) {
+	timeout := time.Second
+	start := time.Now().Add(timeout * -2)
+	// to make sure it doesn't cause health check failure
+	lastSuccess := time.Now().Add(timeout * 10)
+
+	req := httptest.NewRequest("GET", "/health-check", nil)
+	healthCheck := NewHealthCheck(timeout, timeout)
+	healthCheck.StartMonitoring()
+	healthCheck.lastActivity = start
+	healthCheck.lastSuccessfulRun = lastSuccess
+
+	w := httptest.NewRecorder()
+	healthCheck.ServeHTTP(w, req)
+	assert.Equal(t, 500, w.Code)
+
+	w = httptest.NewRecorder()
+	healthCheck.UpdateLastSuccessfulRun(time.Now())
+	healthCheck.ServeHTTP(w, req)
+	assert.Equal(t, 200, w.Code)
+
+	// verify last successful run from the future wasn't overwritten
+	assert.Equal(t, true, healthCheck.lastSuccessfulRun.After(healthCheck.lastActivity))
+}
+
+func TestUpdateLastSuccessfulRun(t *testing.T) {
+	timeout := time.Second
+	start := time.Now().Add(timeout * -2)
+	// to make sure it doesn't cause health check failure
+	lastActivity := time.Now().Add(timeout * 10)
+
+	req := httptest.NewRequest("GET", "/health-check", nil)
+	healthCheck := NewHealthCheck(timeout, timeout)
+	healthCheck.StartMonitoring()
+	healthCheck.lastActivity = lastActivity
+	healthCheck.lastSuccessfulRun = start
+
+	w := httptest.NewRecorder()
+	healthCheck.ServeHTTP(w, req)
+	assert.Equal(t, 500, w.Code)
+
+	w = httptest.NewRecorder()
+	healthCheck.UpdateLastSuccessfulRun(time.Now())
+	healthCheck.ServeHTTP(w, req)
+	assert.Equal(t, 200, w.Code)
+
+	// verify last activity timestamp from the future wasn't overwritten
+	assert.Equal(t, true, healthCheck.lastActivity.After(healthCheck.lastSuccessfulRun))
+}
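
Note for readers wiring this up (not part of the patch): the sketch below shows how the extended HealthCheck could be exposed and fed from a main loop. It is a minimal illustration only; the "/health-check" path, the ":8085" address, the 10-second loop interval, and the import path k8s.io/autoscaler/cluster-autoscaler/metrics are assumptions for the example, not taken from this diff.

package main

import (
	"log"
	"net/http"
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/metrics" // assumed import path
)

func main() {
	// Fail the probe after 10 minutes without any recorded activity, or
	// after 15 minutes without a successful (non-erroring) autoscaler run.
	healthCheck := metrics.NewHealthCheck(10*time.Minute, 15*time.Minute)

	// Expose the handler; a kubelet livenessProbe pointed at this path and
	// port would restart the container once ServeHTTP starts returning 500.
	http.Handle("/health-check", healthCheck)
	go func() {
		log.Fatal(http.ListenAndServe(":8085", nil)) // address is illustrative
	}()

	// Timeouts are only enforced once monitoring is switched on.
	healthCheck.StartMonitoring()

	for {
		loopStart := time.Now()
		healthCheck.UpdateLastActivity(loopStart)

		// ... one autoscaler iteration would run here ...

		// After an iteration that did not end in a non-transient error,
		// refresh the success timestamp so successTimeout does not trip.
		healthCheck.UpdateLastSuccessfulRun(time.Now())

		time.Sleep(10 * time.Second)
	}
}

With this wiring, a loop that keeps running but consistently fails now trips the max-failing-time limit, whereas previously only complete inactivity (max-inactivity) caused the probe to fail.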