Skip to content

Commit

Permalink
Add failing health check if autoscaler loop consistently returns error
Browse files Browse the repository at this point in the history
  • Loading branch information
aleksandra-malinowska committed May 29, 2017
1 parent 5988fec commit 9727724
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 22 deletions.
7 changes: 5 additions & 2 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ var (
"Type of node group expander to be used in scale up. Available values: ["+strings.Join(expander.AvailableExpanders, ",")+"]")

writeStatusConfigMapFlag = flag.Bool("write-status-configmap", true, "Should CA write status information to a configmap")
maxInactivityFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
maxInactivityTimeFlag = flag.Duration("max-inactivity", 10*time.Minute, "Maximum time from last recorded autoscaler activity before automatic restart")
maxFailingTimeFlag = flag.Duration("max-failing-time", 15*time.Minute, "Maximum time from last recorded successful autoscaler run before automatic restart")
)

func createAutoscalerOptions() core.AutoscalerOptions {
Expand Down Expand Up @@ -195,6 +196,8 @@ func run(healthCheck *metrics.HealthCheck) {
err := autoscaler.RunOnce(loopStart)
if err != nil && err.Type() != errors.TransientError {
metrics.RegisterError(err)
} else {
healthCheck.UpdateLastSuccessfulRun(time.Now())
}

metrics.UpdateDuration("main", loopStart)
Expand All @@ -212,7 +215,7 @@ func main() {
"Can be used multiple times. Format: <min>:<max>:<other...>")
kube_flag.InitFlags()

healthCheck := metrics.NewHealthCheck(*maxInactivityFlag)
healthCheck := metrics.NewHealthCheck(*maxInactivityTimeFlag, *maxFailingTimeFlag)

glog.Infof("Cluster Autoscaler %s", ClusterAutoscalerVersion)

Expand Down
49 changes: 38 additions & 11 deletions cluster-autoscaler/metrics/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,24 @@ import (

// HealthCheck contains information about last time of autoscaler activity and timeout
type HealthCheck struct {
lastActivity time.Time
mutex *sync.Mutex
timeout time.Duration
checkTimeout bool
lastActivity time.Time
lastSuccessfulRun time.Time
mutex *sync.Mutex
activityTimeout time.Duration
successTimeout time.Duration
checkTimeout bool
}

// NewHealthCheck builds new HealthCheck object with given timeout
func NewHealthCheck(timeout time.Duration) *HealthCheck {
func NewHealthCheck(activityTimeout, successTimeout time.Duration) *HealthCheck {
now := time.Now()
return &HealthCheck{
lastActivity: time.Now(),
mutex: &sync.Mutex{},
timeout: timeout,
checkTimeout: false,
lastActivity: now,
lastSuccessfulRun: now,
mutex: &sync.Mutex{},
activityTimeout: activityTimeout,
successTimeout: successTimeout,
checkTimeout: false,
}
}

Expand All @@ -50,18 +55,27 @@ func (hc *HealthCheck) StartMonitoring() {
if now.After(hc.lastActivity) {
hc.lastActivity = now
}
if now.After(hc.lastSuccessfulRun) {
hc.lastSuccessfulRun = now
}
}

// ServeHTTP implements http.Handler interface to provide a health-check endpoint
func (hc *HealthCheck) ServeHTTP(w http.ResponseWriter, r *http.Request) {
hc.mutex.Lock()

lastActivity := hc.lastActivity
timedOut := hc.checkTimeout && time.Now().After(lastActivity.Add(hc.timeout))
lastSuccessfulRun := hc.lastSuccessfulRun
now := time.Now()
activityTimedOut := now.After(lastActivity.Add(hc.activityTimeout))
successTimedOut := now.After(lastSuccessfulRun.Add(hc.successTimeout))
timedOut := hc.checkTimeout && (activityTimedOut || successTimedOut)

hc.mutex.Unlock()

if timedOut {
w.WriteHeader(500)
w.Write([]byte(fmt.Sprintf("Error: last activity more than %v ago", time.Now().Sub(lastActivity).String())))
w.Write([]byte(fmt.Sprintf("Error: last activity more %v ago, last success more than %v ago", time.Now().Sub(lastActivity).String(), time.Now().Sub(lastSuccessfulRun).String())))
} else {
w.WriteHeader(200)
w.Write([]byte("OK"))
Expand All @@ -76,3 +90,16 @@ func (hc *HealthCheck) UpdateLastActivity(timestamp time.Time) {
hc.lastActivity = timestamp
}
}

// UpdateLastSuccessfulRun updates last time of successful (i.e. not ending in error) activity
func (hc *HealthCheck) UpdateLastSuccessfulRun(timestamp time.Time) {
hc.mutex.Lock()
defer hc.mutex.Unlock()
if timestamp.After(hc.lastSuccessfulRun) {
hc.lastSuccessfulRun = timestamp
}
// finishing successful run is also a sign of activity
if timestamp.After(hc.lastActivity) {
hc.lastActivity = timestamp
}
}
72 changes: 63 additions & 9 deletions cluster-autoscaler/metrics/liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,48 +24,52 @@ import (
"github.com/stretchr/testify/assert"
)

func getTestResponse(start time.Time, timeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
func getTestResponse(start time.Time, activityTimeout, successTimeout time.Duration, checkMonitoring bool) *httptest.ResponseRecorder {
req := httptest.NewRequest("GET", "/health-check", nil)
w := httptest.NewRecorder()
healthCheck := NewHealthCheck(timeout)
healthCheck := NewHealthCheck(activityTimeout, successTimeout)
if checkMonitoring {
healthCheck.StartMonitoring()
}
healthCheck.lastActivity = start
healthCheck.lastSuccessfulRun = start
healthCheck.ServeHTTP(w, req)
return w
}

func TestOkServeHTTP(t *testing.T) {
w := getTestResponse(time.Now(), time.Second, true)
w := getTestResponse(time.Now(), time.Second, time.Second, true)
assert.Equal(t, 200, w.Code)
}

func TestFailServeHTTP(t *testing.T) {
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, true)
func TestFailTimeoutServeHTTP(t *testing.T) {
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, true)
assert.Equal(t, 500, w.Code)
}

func TestMonitoringOffAfterTimeout(t *testing.T) {
w := getTestResponse(time.Now(), time.Second, false)
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, time.Second, false)
assert.Equal(t, 200, w.Code)
}

func TestMonitoringOffBeforeTimeout(t *testing.T) {
w := getTestResponse(time.Now().Add(time.Second*-2), time.Second, false)
w := getTestResponse(time.Now().Add(time.Second*2), time.Second, time.Second, false)
assert.Equal(t, 200, w.Code)
}

func TestUpdateLastActivity(t *testing.T) {
timeout := time.Second
start := time.Now().Add(timeout * -2)
// to make sure it doesn't cause health check failure
lastSuccess := time.Now().Add(timeout * 10)

req := httptest.NewRequest("GET", "/health-check", nil)
w := httptest.NewRecorder()
healthCheck := NewHealthCheck(timeout)
healthCheck := NewHealthCheck(timeout, timeout)
healthCheck.StartMonitoring()
healthCheck.lastActivity = start
healthCheck.lastSuccessfulRun = lastSuccess

w := httptest.NewRecorder()
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 500, w.Code)

Expand All @@ -74,3 +78,53 @@ func TestUpdateLastActivity(t *testing.T) {
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)
}

func TestUpdateActivityAtUpdateLastSuccessfulRun(t *testing.T) {
timeout := time.Second
start := time.Now().Add(timeout * -2)
// to make sure it doesn't cause health check failure
lastSuccess := time.Now().Add(timeout * 10)

req := httptest.NewRequest("GET", "/health-check", nil)
healthCheck := NewHealthCheck(timeout, timeout)
healthCheck.StartMonitoring()
healthCheck.lastActivity = start
healthCheck.lastSuccessfulRun = lastSuccess

w := httptest.NewRecorder()
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 500, w.Code)

w = httptest.NewRecorder()
healthCheck.UpdateLastSuccessfulRun(time.Now())
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)

// verify last successful run from the future wasn't overwritten
assert.Equal(t, true, healthCheck.lastSuccessfulRun.After(healthCheck.lastActivity))
}

func TestUpdateLastSuccessfulRun(t *testing.T) {
timeout := time.Second
start := time.Now().Add(timeout * -2)
// to make sure it doesn't cause health check failure
lastActivity := time.Now().Add(timeout * 10)

req := httptest.NewRequest("GET", "/health-check", nil)
healthCheck := NewHealthCheck(timeout, timeout)
healthCheck.StartMonitoring()
healthCheck.lastActivity = lastActivity
healthCheck.lastSuccessfulRun = start

w := httptest.NewRecorder()
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 500, w.Code)

w = httptest.NewRecorder()
healthCheck.UpdateLastSuccessfulRun(time.Now())
healthCheck.ServeHTTP(w, req)
assert.Equal(t, 200, w.Code)

// verify last activity timestamp from the future wasn't overwritten
assert.Equal(t, true, healthCheck.lastActivity.After(healthCheck.lastSuccessfulRun))
}

0 comments on commit 9727724

Please sign in to comment.