Skip to content

Commit

Permalink
Limit statsz updates (#5470)
Browse files Browse the repository at this point in the history
This PR adds a filter to `sendStatsz` that limits statsz updates to the
current heartbeat interval (max once per second), adding a `time.Time`
field to track the time the last statsz update was sent. This limit
should reduce overall `STATSZ` system event load in large clusters while
still allowing initial statsz update to quickly reach newly-discovered
nodes.

Fixes #5469.

Signed-off-by: Will Jordan <[email protected]>
  • Loading branch information
wjordan authored Jun 5, 2024
1 parent b294e53 commit 958b61f
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 0 deletions.
43 changes: 43 additions & 0 deletions server/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ const (
// FIXME(dlc) - make configurable.
var eventsHBInterval = 30 * time.Second

var statszRateLimit = 1 * time.Second

type sysMsgHandler func(sub *subscription, client *client, acc *Account, subject, reply string, hdr, msg []byte)

// Used if we have to queue things internally to avoid the route/gw path.
Expand Down Expand Up @@ -134,6 +136,7 @@ type internal struct {
shash string
inboxPre string
remoteStatsSub *subscription
lastStatsz time.Time
}

// ServerStatsMsg is sent periodically with stats updates.
Expand Down Expand Up @@ -808,6 +811,10 @@ func (s *Server) sendStatsz(subj string) {
var m ServerStatsMsg
s.updateServerUsage(&m.Stats)

if s.limitStatsz(subj) {
return
}

s.mu.RLock()
defer s.mu.RUnlock()

Expand Down Expand Up @@ -949,6 +956,35 @@ func (s *Server) sendStatsz(subj string) {
s.sendInternalMsg(subj, _EMPTY_, &m.Server, &m)
}

// Limit updates to the heartbeat interval, max one second.
func (s *Server) limitStatsz(subj string) bool {
s.mu.Lock()
defer s.mu.Unlock()

if s.sys == nil {
return true
}

// Only limit the normal broadcast subject.
if subj != fmt.Sprintf(serverStatsSubj, s.ID()) {
return false
}

interval := statszRateLimit
if s.sys.cstatsz < interval {
interval = s.sys.cstatsz
}
if time.Since(s.sys.lastStatsz) < interval {
// Reschedule heartbeat for the next interval.
if s.sys.stmr != nil {
s.sys.stmr.Reset(time.Until(s.sys.lastStatsz.Add(interval)))
}
return true
}
s.sys.lastStatsz = time.Now()
return false
}

// Send out our statz update.
// This should be wrapChk() to setup common locking.
func (s *Server) heartbeatStatsz() {
Expand All @@ -966,6 +1002,12 @@ func (s *Server) heartbeatStatsz() {
go s.sendStatszUpdate()
}

// Reset statsz rate limit for the next broadcast.
// This should be wrapChk() to setup common locking.
func (s *Server) resetLastStatsz() {
s.sys.lastStatsz = time.Time{}
}

func (s *Server) sendStatszUpdate() {
s.sendStatsz(fmt.Sprintf(serverStatsSubj, s.ID()))
}
Expand Down Expand Up @@ -1869,6 +1911,7 @@ func (s *Server) statszReq(sub *subscription, c *client, _ *Account, subject, re
// No reply is a signal that we should use our normal broadcast subject.
if reply == _EMPTY_ {
reply = fmt.Sprintf(serverStatsSubj, s.info.ID)
s.wrapChk(s.resetLastStatsz)
}

opts := StatszEventOptions{}
Expand Down
14 changes: 14 additions & 0 deletions server/events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3538,3 +3538,17 @@ func Benchmark_GetHash(b *testing.B) {
default:
}
}

func TestClusterSetupMsgs(t *testing.T) {
numServers := 10
c := createClusterEx(t, false, 0, false, "cluster", numServers)
var totalOut int

for _, server := range c.servers {
totalOut += int(server.outMsgs)
}
totalExpected := numServers * numServers
if totalOut >= totalExpected {
t.Fatalf("Total outMsgs is %d, expected < %d\n", totalOut, totalExpected)
}
}
4 changes: 4 additions & 0 deletions server/jetstream_super_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,10 @@ func TestJetStreamSuperClusterPushConsumerInterest(t *testing.T) {
}

func TestJetStreamSuperClusterOverflowPlacement(t *testing.T) {
orgStatszRateLimit := statszRateLimit
statszRateLimit = time.Millisecond * 100
defer func() { statszRateLimit = orgStatszRateLimit }()

sc := createJetStreamSuperClusterWithTemplate(t, jsClusterMaxBytesTempl, 3, 3)
defer sc.shutdown()

Expand Down

0 comments on commit 958b61f

Please sign in to comment.