From 0078be2647bc3e2ec0138e6d1472fdd3968a6100 Mon Sep 17 00:00:00 2001 From: Ceyhun Onur Date: Wed, 29 Mar 2023 14:04:17 +0300 Subject: [PATCH 1/2] special case no sent/received message in network health check --- network/network.go | 64 ++++++++++++++++++++++++++++++++++---------- network/peer/peer.go | 26 ++++++++++++------ 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/network/network.go b/network/network.go index 6b6d9e35969..ce5cc3075dc 100644 --- a/network/network.go +++ b/network/network.go @@ -347,20 +347,32 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { // Make sure we've received an incoming message within the threshold now := n.peerConfig.Clock.Time() - lastMsgReceivedAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastReceived), 0) - timeSinceLastMsgReceived := now.Sub(lastMsgReceivedAt) - wasMsgReceivedRecently := timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived - healthy = healthy && wasMsgReceivedRecently - details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() - n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) + lastMsgReceivedAt, msgReceived := n.getLastReceived() + wasMsgReceivedRecently := false + timeSinceLastMsgReceived := time.Duration(0) + if !msgReceived { + healthy = false + } else { + timeSinceLastMsgReceived = now.Sub(lastMsgReceivedAt) + wasMsgReceivedRecently = timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived + healthy = healthy && wasMsgReceivedRecently + details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() + n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) + } // Make sure we've sent an outgoing message within the threshold - lastMsgSentAt := time.Unix(atomic.LoadInt64(&n.peerConfig.LastSent), 0) - timeSinceLastMsgSent := now.Sub(lastMsgSentAt) - wasMsgSentRecently := timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent - healthy = healthy && wasMsgSentRecently - details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() - n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) + lastMsgSentAt, msgSent := n.getLastSent() + wasMsgSentRecently := false + timeSinceLastMsgSent := time.Duration(0) + if !msgSent { + healthy = false + } else { + timeSinceLastMsgSent = now.Sub(lastMsgSentAt) + wasMsgSentRecently = timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent + healthy = healthy && wasMsgSentRecently + details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() + n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) + } // Make sure the message send failed rate isn't too high isMsgFailRate := sendFailRate <= n.config.HealthConfig.MaxSendFailRate @@ -381,10 +393,18 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { errorReasons = append(errorReasons, fmt.Sprintf("not connected to a minimum of %d peer(s) only %d", n.config.HealthConfig.MinConnectedPeers, connectedTo)) } if !wasMsgReceivedRecently { - errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) + if !msgReceived { + errorReasons = append(errorReasons, "no messages received from network") + } else { + errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) + } } if !wasMsgSentRecently { - errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) + if !msgSent { + errorReasons = append(errorReasons, "no messages sent to network") + } else { + errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) + } } if !isMsgFailRate { errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate)) @@ -1418,3 +1438,19 @@ func (n *network) gossipPeerLists() { p.StartSendPeerList() } } + +func (n *network) getLastReceived() (time.Time, bool) { + lastReceived := atomic.LoadInt64(&n.peerConfig.LastReceived) + if lastReceived == 0 { + return time.Time{}, false + } + return time.Unix(lastReceived, 0), true +} + +func (n *network) getLastSent() (time.Time, bool) { + lastSent := atomic.LoadInt64(&n.peerConfig.LastSent) + if lastSent == 0 { + return time.Time{}, false + } + return time.Unix(lastSent, 0), true +} diff --git a/network/peer/peer.go b/network/peer/peer.go index 4655b6f51e9..dfe670842be 100644 --- a/network/peer/peer.go +++ b/network/peer/peer.go @@ -267,8 +267,8 @@ func (p *peer) Info() Info { PublicIP: publicIPStr, ID: p.id, Version: p.version.String(), - LastSent: time.Unix(atomic.LoadInt64(&p.lastSent), 0), - LastReceived: time.Unix(atomic.LoadInt64(&p.lastReceived), 0), + LastSent: p.LastSent(), + LastReceived: p.LastReceived(), ObservedUptime: json.Uint32(primaryUptime), ObservedSubnetUptimes: uptimes, TrackedSubnets: trackedSubnets, @@ -468,9 +468,8 @@ func (p *peer) readMessages() { continue } - now := p.Clock.Time().Unix() - atomic.StoreInt64(&p.Config.LastReceived, now) - atomic.StoreInt64(&p.lastReceived, now) + now := p.Clock.Time() + p.storeLastReceived(now) p.Metrics.Received(msg, msgLen) // Handle the message. Note that when we are done handling this message, @@ -578,9 +577,8 @@ func (p *peer) writeMessage(writer io.Writer, msg message.OutboundMessage) { return } - now := p.Clock.Time().Unix() - atomic.StoreInt64(&p.Config.LastSent, now) - atomic.StoreInt64(&p.lastSent, now) + now := p.Clock.Time() + p.storeLastSent(now) p.Metrics.Sent(msg) } @@ -1073,3 +1071,15 @@ func (p *peer) handlePeerListAck(msg *p2p.PeerListAck) { func (p *peer) nextTimeout() time.Time { return p.Clock.Time().Add(p.PongTimeout) } + +func (p *peer) storeLastSent(time time.Time) { + unixTime := time.Unix() + atomic.StoreInt64(&p.Config.LastSent, unixTime) + atomic.StoreInt64(&p.lastSent, unixTime) +} + +func (p *peer) storeLastReceived(time time.Time) { + unixTime := time.Unix() + atomic.StoreInt64(&p.Config.LastReceived, unixTime) + atomic.StoreInt64(&p.lastReceived, unixTime) +} From a42de255223fb5a39e529f7e70dbe486f2181eac Mon Sep 17 00:00:00 2001 From: Ceyhun Onur Date: Thu, 30 Mar 2023 16:14:02 +0300 Subject: [PATCH 2/2] refactor conditions --- network/network.go | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/network/network.go b/network/network.go index ce5cc3075dc..8a41babafba 100644 --- a/network/network.go +++ b/network/network.go @@ -348,31 +348,27 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { now := n.peerConfig.Clock.Time() lastMsgReceivedAt, msgReceived := n.getLastReceived() - wasMsgReceivedRecently := false + wasMsgReceivedRecently := msgReceived timeSinceLastMsgReceived := time.Duration(0) - if !msgReceived { - healthy = false - } else { + if msgReceived { timeSinceLastMsgReceived = now.Sub(lastMsgReceivedAt) wasMsgReceivedRecently = timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived - healthy = healthy && wasMsgReceivedRecently details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) } + healthy = healthy && wasMsgReceivedRecently // Make sure we've sent an outgoing message within the threshold lastMsgSentAt, msgSent := n.getLastSent() - wasMsgSentRecently := false + wasMsgSentRecently := msgSent timeSinceLastMsgSent := time.Duration(0) - if !msgSent { - healthy = false - } else { + if msgSent { timeSinceLastMsgSent = now.Sub(lastMsgSentAt) wasMsgSentRecently = timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent - healthy = healthy && wasMsgSentRecently details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) } + healthy = healthy && wasMsgSentRecently // Make sure the message send failed rate isn't too high isMsgFailRate := sendFailRate <= n.config.HealthConfig.MaxSendFailRate @@ -392,20 +388,17 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) { if !isConnected { errorReasons = append(errorReasons, fmt.Sprintf("not connected to a minimum of %d peer(s) only %d", n.config.HealthConfig.MinConnectedPeers, connectedTo)) } - if !wasMsgReceivedRecently { - if !msgReceived { - errorReasons = append(errorReasons, "no messages received from network") - } else { - errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) - } + if !msgReceived { + errorReasons = append(errorReasons, "no messages received from network") + } else if !wasMsgReceivedRecently { + errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) } - if !wasMsgSentRecently { - if !msgSent { - errorReasons = append(errorReasons, "no messages sent to network") - } else { - errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) - } + if !msgSent { + errorReasons = append(errorReasons, "no messages sent to network") + } else if !wasMsgSentRecently { + errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) } + if !isMsgFailRate { errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate)) }