Skip to content

Commit

Permalink
Add /raftz monitoring endpoint
Browse files Browse the repository at this point in the history
We often find ourselves without good visibility into what's really going
on in the Raft layer. This endpoint dumps quite a substantial amount of
internal Raft node state.

Filters include:

* `?acc=ACCNAME` to filter by account (defaults to system account if not specified)
* `?group=GROUP` to show only specific groups

Signed-off-by: Neil Twigg <[email protected]>
  • Loading branch information
neilalexander committed Jun 13, 2024
1 parent f263d75 commit 90fe702
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 0 deletions.
137 changes: 137 additions & 0 deletions server/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1421,6 +1421,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
<a href=.%s>Routes</a>
<a href=.%s>LeafNodes</a>
<a href=.%s>Gateways</a>
<a href=.%s>Raft Groups</a>
<a href=.%s class=last>Health Probe</a>
<a href=https://docs.nats.io/running-a-nats-service/nats_admin/monitoring class="help">Help</a>
</body>
Expand All @@ -1436,6 +1437,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
s.basePath(RoutezPath),
s.basePath(LeafzPath),
s.basePath(GatewayzPath),
s.basePath(RaftzPath),
s.basePath(HealthzPath),
)
}
Expand Down Expand Up @@ -3741,3 +3743,138 @@ func (s *Server) profilez(opts *ProfilezOptions) *ProfilezStatus {
Profile: buffer.Bytes(),
}
}

// RaftzGroup is the per-group record reported by the /raftz monitoring
// endpoint. HandleRaftz fills it from the internal fields of a single Raft
// node while holding that node's read lock; the IPQ*Len fields are the
// lengths of the node's proposal, entry, response and apply queues at that
// moment, and WAL is filled by the node's WAL FastState call.
//
// WALError is kept as an error value for Go callers. Its JSON form is
// produced by MarshalJSON below, which renders the error message as a string.
type RaftzGroup struct {
	ID            string                    `json:"id"`
	State         string                    `json:"state"`
	Size          int                       `json:"size"`
	QuorumNeeded  int                       `json:"quorum_needed"`
	Observer      bool                      `json:"observer,omitempty"`
	Paused        bool                      `json:"paused,omitempty"`
	Committed     uint64                    `json:"committed"`
	Applied       uint64                    `json:"applied"`
	CatchingUp    bool                      `json:"catching_up,omitempty"`
	Leader        string                    `json:"leader,omitempty"`
	EverHadLeader bool                      `json:"ever_had_leader"`
	Term          uint64                    `json:"term"`
	Vote          string                    `json:"voted_for,omitempty"`
	PTerm         uint64                    `json:"pterm"`
	PIndex        uint64                    `json:"pindex"`
	IPQPropLen    int                       `json:"ipq_proposal_len"`
	IPQEntryLen   int                       `json:"ipq_entry_len"`
	IPQRespLen    int                       `json:"ipq_resp_len"`
	IPQApplyLen   int                       `json:"ipq_apply_len"`
	WAL           StreamState               `json:"wal"`
	WALError      error                     `json:"wal_error,omitempty"`
	Peers         map[string]RaftzGroupPeer `json:"peers"`
}

// MarshalJSON encodes the group with WALError rendered as its message string.
//
// encoding/json serializes an error interface value by reflecting over the
// exported fields of its concrete type, so ordinary errors would otherwise be
// emitted as "{}" and the message would be lost. A nil WALError is still
// omitted. Note that "wal_error" is emitted after the other keys.
//
// The receiver is a value so that the method is also used for RaftzGroup
// values stored in maps, which are not addressable.
func (g RaftzGroup) MarshalJSON() ([]byte, error) {
	// plain has the same fields as RaftzGroup but no methods, so encoding it
	// does not recurse back into this method.
	type plain RaftzGroup

	var walErr string
	if g.WALError != nil {
		walErr = g.WALError.Error()
	}
	return json.Marshal(struct {
		plain
		// Shadows plain.WALError: when JSON names collide, encoding/json
		// keeps the least nested field.
		WALError string `json:"wal_error,omitempty"`
	}{plain: plain(g), WALError: walErr})
}

// RaftzGroupPeer describes one remote peer of a Raft group as reported by the
// /raftz monitoring endpoint. HandleRaftz builds one entry per peer ID other
// than the node's own ID.
type RaftzGroupPeer struct {
	// Name is the result of the server's serverNameForNode lookup for the peer ID.
	Name string `json:"name"`
	// Known is copied from the peer state's kp field.
	Known bool `json:"known"`
	// LastReplicatedIndex is copied from the peer state's li field; omitted from JSON when zero.
	LastReplicatedIndex uint64 `json:"last_replicated_index,omitempty"`
	// LastSeen is the elapsed time since the peer state's ts timestamp (read
	// as Unix nanoseconds), formatted as a duration string. It is only set
	// when ts is greater than zero.
	LastSeen string `json:"last_seen,omitempty"`
}

// HandleRaftz serves the /raftz monitoring endpoint, which dumps internal
// state of the Raft nodes registered with this server as indented JSON keyed
// by account name and then by group name.
//
// Query parameters:
//   - acc:   account whose groups should be shown; when absent, the system
//     account's name is used.
//   - group: restricts the output to the single group with that name.
//
// The handler replies with 404 when no Raft nodes are registered, when no
// account can be resolved, or when no node matches the filters, and with 500
// when the response cannot be encoded.
func (s *Server) HandleRaftz(w http.ResponseWriter, r *http.Request) {
	const noMatch = "No Raft nodes found, does the specified account/group exist?"

	// raftNodes is guarded by rnMu everywhere else in this handler, so the
	// registration check takes the lock as well.
	s.rnMu.RLock()
	registered := s.raftNodes != nil
	s.rnMu.RUnlock()
	if !registered {
		w.WriteHeader(http.StatusNotFound)
		w.Write([]byte("No Raft nodes registered"))
		return
	}

	query := r.URL.Query()
	gfilter := query.Get("group")
	afilter := query.Get("acc")
	if afilter == _EMPTY_ {
		// Guard against a nil system account instead of dereferencing it
		// blindly; without one there is no default account to report on.
		sys := s.SystemAccount()
		if sys == nil {
			w.WriteHeader(http.StatusNotFound)
			w.Write([]byte(noMatch))
			return
		}
		afilter = sys.Name
	}

	// Select the matching nodes under the registry lock, keeping the concrete
	// *raft so the type assertion happens only once per node.
	nodes := map[string]*raft{}
	pick := func(name string, rg RaftNode) {
		if n, ok := rg.(*raft); ok && n != nil && n.accName == afilter {
			nodes[name] = n
		}
	}
	s.rnMu.RLock()
	if gfilter != _EMPTY_ {
		if rg, ok := s.raftNodes[gfilter]; ok {
			pick(gfilter, rg)
		}
	} else {
		for name, rg := range s.raftNodes {
			pick(name, rg)
		}
	}
	s.rnMu.RUnlock()

	if len(nodes) == 0 {
		w.WriteHeader(http.StatusNotFound)
		w.Write([]byte(noMatch))
		return
	}

	infos := map[string]map[string]RaftzGroup{} // account -> group name -> info
	for name, n := range nodes {
		byGroup := infos[n.accName]
		if byGroup == nil {
			byGroup = map[string]RaftzGroup{}
			infos[n.accName] = byGroup
		}
		// Only take the lock once, using the public RaftNode functions would
		// cause us to take and release the locks over and over again.
		n.RLock()
		info := RaftzGroup{
			ID:            n.id,
			State:         RaftState(n.state.Load()).String(),
			Size:          n.csz,
			QuorumNeeded:  n.qn,
			Observer:      n.observer,
			Paused:        n.paused,
			Committed:     n.commit,
			Applied:       n.applied,
			CatchingUp:    n.catchup != nil,
			Leader:        n.leader,
			EverHadLeader: n.pleader,
			Term:          n.term,
			Vote:          n.vote,
			PTerm:         n.pterm,
			PIndex:        n.pindex,
			IPQPropLen:    n.prop.len(),
			IPQEntryLen:   n.entry.len(),
			IPQRespLen:    n.resp.len(),
			IPQApplyLen:   n.apply.len(),
			WALError:      n.werr,
			Peers:         map[string]RaftzGroupPeer{},
		}
		n.wal.FastState(&info.WAL)
		for id, p := range n.peers {
			// The node's own ID is not reported as a peer.
			if id == n.id {
				continue
			}
			peer := RaftzGroupPeer{
				Name:                s.serverNameForNode(id),
				Known:               p.kp,
				LastReplicatedIndex: p.li,
			}
			// A zero timestamp leaves LastSeen empty so it is omitted from the output.
			if p.ts > 0 {
				peer.LastSeen = time.Since(time.Unix(0, p.ts)).String()
			}
			info.Peers[id] = peer
		}
		n.RUnlock()
		byGroup[name] = info
	}

	b, err := json.MarshalIndent(infos, "", " ")
	if err != nil {
		// Report the failure rather than silently sending an empty body.
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	ResponseHandler(w, r, b)
}
3 changes: 3 additions & 0 deletions server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2892,6 +2892,7 @@ const (
JszPath = "/jsz"
HealthzPath = "/healthz"
IPQueuesPath = "/ipqueuesz"
RaftzPath = "/raftz"
)

func (s *Server) basePath(p string) string {
Expand Down Expand Up @@ -3006,6 +3007,8 @@ func (s *Server) startMonitoring(secure bool) error {
mux.HandleFunc(s.basePath(HealthzPath), s.HandleHealthz)
// IPQueuesz
mux.HandleFunc(s.basePath(IPQueuesPath), s.HandleIPQueuesz)
// Raftz
mux.HandleFunc(s.basePath(RaftzPath), s.HandleRaftz)

// Do not set a WriteTimeout because it could cause cURL/browser
// to return empty response or unable to display page if the
Expand Down

0 comments on commit 90fe702

Please sign in to comment.