Skip to content

Commit

Permalink
Improve health check description before messages have been sent (#1263)
Browse files Browse the repository at this point in the history
  • Loading branch information
ceyonur authored Mar 30, 2023
1 parent 0399745 commit 705c3fa
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 20 deletions.
53 changes: 41 additions & 12 deletions network/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,20 +347,28 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
// Make sure we've received an incoming message within the threshold
now := n.peerConfig.Clock.Time()

lastMsgReceivedAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastReceived), 0)
timeSinceLastMsgReceived := now.Sub(lastMsgReceivedAt)
wasMsgReceivedRecently := timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived
lastMsgReceivedAt, msgReceived := n.getLastReceived()
wasMsgReceivedRecently := msgReceived
timeSinceLastMsgReceived := time.Duration(0)
if msgReceived {
timeSinceLastMsgReceived = now.Sub(lastMsgReceivedAt)
wasMsgReceivedRecently = timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived
details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String()
n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived))
}
healthy = healthy && wasMsgReceivedRecently
details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String()
n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived))

// Make sure we've sent an outgoing message within the threshold
lastMsgSentAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastSent), 0)
timeSinceLastMsgSent := now.Sub(lastMsgSentAt)
wasMsgSentRecently := timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent
lastMsgSentAt, msgSent := n.getLastSent()
wasMsgSentRecently := msgSent
timeSinceLastMsgSent := time.Duration(0)
if msgSent {
timeSinceLastMsgSent = now.Sub(lastMsgSentAt)
wasMsgSentRecently = timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent
details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String()
n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent))
}
healthy = healthy && wasMsgSentRecently
details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String()
n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent))

// Make sure the message send failed rate isn't too high
isMsgFailRate := sendFailRate <= n.config.HealthConfig.MaxSendFailRate
Expand All @@ -380,12 +388,17 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
if !isConnected {
errorReasons = append(errorReasons, fmt.Sprintf("not connected to a minimum of %d peer(s) only %d", n.config.HealthConfig.MinConnectedPeers, connectedTo))
}
if !wasMsgReceivedRecently {
if !msgReceived {
errorReasons = append(errorReasons, "no messages received from network")
} else if !wasMsgReceivedRecently {
errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived))
}
if !wasMsgSentRecently {
if !msgSent {
errorReasons = append(errorReasons, "no messages sent to network")
} else if !wasMsgSentRecently {
errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent))
}

if !isMsgFailRate {
errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate))
}
Expand Down Expand Up @@ -1418,3 +1431,19 @@ func (n *network) gossipPeerLists() {
p.StartSendPeerList()
}
}

// getLastReceived returns the time stored in n.peerConfig.LastReceived and
// whether any value has been recorded there yet.
//
// The field is loaded atomically and interpreted as unix seconds. A stored
// value of 0 is treated as "never set": the zero time.Time and false are
// returned in that case.
func (n *network) getLastReceived() (time.Time, bool) {
	if unixSecs := atomic.LoadInt64(&n.peerConfig.LastReceived); unixSecs != 0 {
		return time.Unix(unixSecs, 0), true
	}
	// Nothing has been recorded yet.
	return time.Time{}, false
}

// getLastSent returns the time stored in n.peerConfig.LastSent and whether
// any value has been recorded there yet.
//
// The field is loaded atomically and interpreted as unix seconds. A stored
// value of 0 is treated as "never set": the zero time.Time and false are
// returned in that case.
func (n *network) getLastSent() (time.Time, bool) {
	if unixSecs := atomic.LoadInt64(&n.peerConfig.LastSent); unixSecs != 0 {
		return time.Unix(unixSecs, 0), true
	}
	// Nothing has been recorded yet.
	return time.Time{}, false
}
26 changes: 18 additions & 8 deletions network/peer/peer.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ func (p *peer) Info() Info {
PublicIP: publicIPStr,
ID: p.id,
Version: p.version.String(),
LastSent: time.Unix(atomic.LoadInt64(&p.lastSent), 0),
LastReceived: time.Unix(atomic.LoadInt64(&p.lastReceived), 0),
LastSent: p.LastSent(),
LastReceived: p.LastReceived(),
ObservedUptime: json.Uint32(primaryUptime),
ObservedSubnetUptimes: uptimes,
TrackedSubnets: trackedSubnets,
Expand Down Expand Up @@ -468,9 +468,8 @@ func (p *peer) readMessages() {
continue
}

now := p.Clock.Time().Unix()
atomic.StoreInt64(&p.Config.LastReceived, now)
atomic.StoreInt64(&p.lastReceived, now)
now := p.Clock.Time()
p.storeLastReceived(now)
p.Metrics.Received(msg, msgLen)

// Handle the message. Note that when we are done handling this message,
Expand Down Expand Up @@ -578,9 +577,8 @@ func (p *peer) writeMessage(writer io.Writer, msg message.OutboundMessage) {
return
}

now := p.Clock.Time().Unix()
atomic.StoreInt64(&p.Config.LastSent, now)
atomic.StoreInt64(&p.lastSent, now)
now := p.Clock.Time()
p.storeLastSent(now)
p.Metrics.Sent(msg)
}

Expand Down Expand Up @@ -1073,3 +1071,15 @@ func (p *peer) handlePeerListAck(msg *p2p.PeerListAck) {
// nextTimeout returns the current time reported by p.Clock advanced by
// p.PongTimeout.
func (p *peer) nextTimeout() time.Time {
	now := p.Clock.Time()
	return now.Add(p.PongTimeout)
}

// storeLastSent records sentAt as the time a message was last sent.
//
// sentAt is reduced to whole unix seconds and written, with two separate
// atomic stores, to p.Config.LastSent and to this peer's own p.lastSent.
//
// The parameter is deliberately not named "time": doing so would shadow the
// time package for the whole function body.
func (p *peer) storeLastSent(sentAt time.Time) {
	unixSecs := sentAt.Unix()
	atomic.StoreInt64(&p.Config.LastSent, unixSecs)
	atomic.StoreInt64(&p.lastSent, unixSecs)
}

// storeLastReceived records receivedAt as the time a message was last
// received.
//
// receivedAt is reduced to whole unix seconds and written, with two separate
// atomic stores, to p.Config.LastReceived and to this peer's own
// p.lastReceived.
//
// The parameter is deliberately not named "time": doing so would shadow the
// time package for the whole function body.
func (p *peer) storeLastReceived(receivedAt time.Time) {
	unixSecs := receivedAt.Unix()
	atomic.StoreInt64(&p.Config.LastReceived, unixSecs)
	atomic.StoreInt64(&p.lastReceived, unixSecs)
}

0 comments on commit 705c3fa

Please sign in to comment.