diff --git a/network/network.go b/network/network.go index 6b6d9e35969..8a41babafba 100644 --- a/network/network.go +++ b/network/network.go @@ -347,20 +347,28 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { // Make sure we've received an incoming message within the threshold now := n.peerConfig.Clock.Time() - lastMsgReceivedAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastReceived), 0) - timeSinceLastMsgReceived := now.Sub(lastMsgReceivedAt) - wasMsgReceivedRecently := timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived + lastMsgReceivedAt, msgReceived := n.getLastReceived() + wasMsgReceivedRecently := msgReceived + timeSinceLastMsgReceived := time.Duration(0) + if msgReceived { + timeSinceLastMsgReceived = now.Sub(lastMsgReceivedAt) + wasMsgReceivedRecently = timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived + details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() + n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) + } healthy = healthy && wasMsgReceivedRecently - details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() - n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) // Make sure we've sent an outgoing message within the threshold - lastMsgSentAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastSent), 0) - timeSinceLastMsgSent := now.Sub(lastMsgSentAt) - wasMsgSentRecently := timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent + lastMsgSentAt, msgSent := n.getLastSent() + wasMsgSentRecently := msgSent + timeSinceLastMsgSent := time.Duration(0) + if msgSent { + timeSinceLastMsgSent = now.Sub(lastMsgSentAt) + wasMsgSentRecently = timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent + details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() + n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) + } healthy = healthy && wasMsgSentRecently - details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() - n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) // Make sure the message send failed rate isn't too high isMsgFailRate := sendFailRate <= n.config.HealthConfig.MaxSendFailRate @@ -380,12 +388,17 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { if !isConnected { errorReasons = append(errorReasons, fmt.Sprintf("not connected to a minimum of %d peer(s) only %d", n.config.HealthConfig.MinConnectedPeers, connectedTo)) } - if !wasMsgReceivedRecently { + if !msgReceived { + errorReasons = append(errorReasons, "no messages received from network") + } else if !wasMsgReceivedRecently { errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) } - if !wasMsgSentRecently { + if !msgSent { + errorReasons = append(errorReasons, "no messages sent to network") + } else if !wasMsgSentRecently { errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) } + if !isMsgFailRate { errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate)) } @@ -1418,3 +1431,19 @@ func (n *network) gossipPeerLists() { p.StartSendPeerList() } } + +func (n *network) getLastReceived() (time.Time, bool) { + lastReceived := atomic.LoadInt64(&n.peerConfig.LastReceived) + if lastReceived == 0 { + return time.Time{}, false + } + return time.Unix(lastReceived, 0), true +} + +func (n *network) getLastSent() (time.Time, bool) { + lastSent := atomic.LoadInt64(&n.peerConfig.LastSent) + if lastSent == 0 { + return time.Time{}, false + } + return time.Unix(lastSent, 0), true +} diff --git a/network/peer/peer.go b/network/peer/peer.go index 4655b6f51e9..dfe670842be 100644 --- a/network/peer/peer.go +++ b/network/peer/peer.go @@ -267,8 +267,8 @@ func (p *peer) Info() Info { PublicIP: publicIPStr, ID: p.id, Version: p.version.String(), - LastSent: time.Unix(atomic.LoadInt64(&p.lastSent), 0), - LastReceived: time.Unix(atomic.LoadInt64(&p.lastReceived), 0), + LastSent: p.LastSent(), + LastReceived: p.LastReceived(), ObservedUptime: json.Uint32(primaryUptime), ObservedSubnetUptimes: uptimes, TrackedSubnets: trackedSubnets, @@ -468,9 +468,8 @@ func (p *peer) readMessages() { continue } - now := p.Clock.Time().Unix() - atomic.StoreInt64(&p.Config.LastReceived, now) - atomic.StoreInt64(&p.lastReceived, now) + now := p.Clock.Time() + p.storeLastReceived(now) p.Metrics.Received(msg, msgLen) // Handle the message. Note that when we are done handling this message, @@ -578,9 +577,8 @@ func (p *peer) writeMessage(writer io.Writer, msg message.OutboundMessage) { return } - now := p.Clock.Time().Unix() - atomic.StoreInt64(&p.Config.LastSent, now) - atomic.StoreInt64(&p.lastSent, now) + now := p.Clock.Time() + p.storeLastSent(now) p.Metrics.Sent(msg) } @@ -1073,3 +1071,15 @@ func (p *peer) handlePeerListAck(msg *p2p.PeerListAck) { func (p *peer) nextTimeout() time.Time { return p.Clock.Time().Add(p.PongTimeout) } + +func (p *peer) storeLastSent(time time.Time) { + unixTime := time.Unix() + atomic.StoreInt64(&p.Config.LastSent, unixTime) + atomic.StoreInt64(&p.lastSent, unixTime) +} + +func (p *peer) storeLastReceived(time time.Time) { + unixTime := time.Unix() + atomic.StoreInt64(&p.Config.LastReceived, unixTime) + atomic.StoreInt64(&p.lastReceived, unixTime) +}