From 80d44358410b50a4955cdd0aa27b258ca1234c78 Mon Sep 17 00:00:00 2001 From: Ben Hoyt Date: Wed, 28 Feb 2024 16:36:09 +1300 Subject: [PATCH] fix(daemon): avoid acquiring state lock in health check endpoint (#369) I don't love the addition of the (exported) State.LockCount method, but I couldn't think of another way to ensure that State.Lock was not being called, apart from significant surgery allowing us to swap out the state lock with an interface. In addition, I'm not sure we need the maintenance/warnings stuff at all in Pebble, as they're not used, but leaving that for a separate discussion and PR. I tested by saving the script https://go.dev/play/p/LJDLEaXeBpd to cmd/hithealth/main.go, running a Pebble server, and then running a task to heavily load my CPU cores (though the latter doesn't seem to make much of a difference!). Without the fix, GET /v1/health response times are all over 20ms, and commonly in the 100-300ms range. In other words, very long for a do-nothing health check endpoint. This is because the other clients (in the hithealth script) are starting services which modifies state and holds the state lock for a relatively long time. $ go run ./cmd/hithealth/ 0 > 20ms: 0.028897s 1 > 20ms: 0.192796s 2 > 20ms: 0.268904s 3 > 20ms: 0.082985s 4 > 20ms: 0.030554s 5 > 20ms: 0.156912s 6 > 20ms: 0.245212s 7 > 20ms: 0.047436s 8 > 20ms: 0.099474s 9 > 20ms: 0.070440s 10 > 20ms: 0.060641s 11 > 20ms: 0.202535s 12 > 20ms: 0.206226s ... With the fix, GET /v1/health doesn't hold the state lock at all. Response times are only over 20ms every 10,000 or so requests, and never above 100ms even under heavy load: $ go run ./cmd/hithealth/ 13891 > 20ms: 0.023703s 15923 > 20ms: 0.024769s 21915 > 20ms: 0.076423s ... --- internals/daemon/api_health.go | 34 ++++++++++++++++--- internals/daemon/api_health_test.go | 51 +++++++++++++++++++++++++++++ internals/overlord/state/state.go | 10 ++++++ 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/internals/daemon/api_health.go b/internals/daemon/api_health.go index 28eef5e6..3384a3a0 100644 --- a/internals/daemon/api_health.go +++ b/internals/daemon/api_health.go @@ -15,6 +15,7 @@ package daemon import ( + "encoding/json" "net/http" "github.com/canonical/x-go/strutil" @@ -57,9 +58,34 @@ func v1Health(c *Command, r *http.Request, _ *UserState) Response { } } - return SyncResponse(&resp{ - Type: ResponseTypeSync, - Status: status, - Result: healthInfo{Healthy: healthy}, + return SyncResponse(&healthResp{ + Type: ResponseTypeSync, + Status: status, + StatusText: http.StatusText(status), + Result: healthInfo{Healthy: healthy}, }) } + +// Like the resp struct, but without the warning/maintenance fields, so that +// the health endpoint doesn't have to acquire the state lock (resulting in a +// slow response on heavily-loaded systems). +type healthResp struct { + Type ResponseType `json:"type"` + Status int `json:"status-code"` + StatusText string `json:"status,omitempty"` + Result interface{} `json:"result,omitempty"` +} + +func (r *healthResp) ServeHTTP(w http.ResponseWriter, _ *http.Request) { + status := r.Status + bs, err := json.Marshal(r) + if err != nil { + logger.Noticef("Cannot marshal %#v to JSON: %v", *r, err) + bs = nil + status = http.StatusInternalServerError + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + w.Write(bs) +} diff --git a/internals/daemon/api_health_test.go b/internals/daemon/api_health_test.go index b2887a01..96dc79fe 100644 --- a/internals/daemon/api_health_test.go +++ b/internals/daemon/api_health_test.go @@ -20,9 +20,11 @@ import ( "io" "net/http" "net/http/httptest" + "time" . "gopkg.in/check.v1" + "github.com/canonical/pebble/client" "github.com/canonical/pebble/internals/overlord" "github.com/canonical/pebble/internals/overlord/checkstate" "github.com/canonical/pebble/internals/plan" @@ -206,6 +208,55 @@ func (s *healthSuite) TestChecksError(c *C) { }) } +// Ensure state lock is not held at all for GET /v1/health requests. +// Regression test for issue described at: +// +// - https://github.com/canonical/pebble/issues/366 +// - https://bugs.launchpad.net/juju/+bug/2052517 +func (s *apiSuite) TestStateLockNotHeld(c *C) { + daemonOpts := &Options{ + Dir: s.pebbleDir, + SocketPath: s.pebbleDir + ".pebble.socket", + } + daemon, err := New(daemonOpts) + c.Assert(err, IsNil) + c.Assert(daemon.Init(), IsNil) + c.Assert(daemon.Start(), IsNil) + defer func() { + c.Assert(daemon.Stop(nil), IsNil) + }() + + // Wait for lock count to stabilise (daemon starts goroutine(s) that use + // it on startup). Normally it will stabilise almost instantly. + prevCount := daemon.state.LockCount() + for i := 0; i < 50; i++ { + if i == 49 { + c.Fatal("timed out waiting for lock count to stabilise") + } + time.Sleep(100 * time.Millisecond) + newCount := daemon.state.LockCount() + if newCount == prevCount { + break + } + prevCount = newCount + } + + // Use real HTTP client so we're exercising the full ServeHTTP flow. + // Could use daemon.serve.Handler directly, but this seems even better. + pebble, err := client.New(&client.Config{ + Socket: daemonOpts.SocketPath, + }) + c.Assert(err, IsNil) + + // Ensure lock count doesn't change during GET /v1/health request. + before := daemon.state.LockCount() + healthy, err := pebble.Health(&client.HealthOptions{}) + c.Assert(err, IsNil) + c.Assert(healthy, Equals, true) + after := daemon.state.LockCount() + c.Assert(after, Equals, before) +} + func serveHealth(c *C, method, url string, body io.Reader) (int, map[string]interface{}) { recorder := httptest.NewRecorder() request, err := http.NewRequest(method, url, body) diff --git a/internals/overlord/state/state.go b/internals/overlord/state/state.go index e72dc3ef..d72c8186 100644 --- a/internals/overlord/state/state.go +++ b/internals/overlord/state/state.go @@ -79,6 +79,8 @@ type State struct { mu sync.Mutex muC int32 + lockCount int32 // used only for testing + lastTaskId int lastChangeId int lastLaneId int @@ -135,6 +137,14 @@ func (s *State) Modified() bool { func (s *State) Lock() { s.mu.Lock() atomic.AddInt32(&s.muC, 1) + atomic.AddInt32(&s.lockCount, 1) +} + +// LockCount returns the number of times the state lock was held. +// +// NOTE: This needs to be exported, but should only be used in testing. +func (s *State) LockCount() int { + return int(atomic.LoadInt32(&s.lockCount)) } func (s *State) reading() {