deal with a temporary loss of network connectivity #1354

Merged · 7 commits · Jul 23, 2020
73 changes: 46 additions & 27 deletions beacon_chain/eth2_network.nim
@@ -23,7 +23,7 @@ import
   # Beacon node modules
   version, conf, eth2_discovery, libp2p_json_serialization, conf,
   ssz/ssz_serialization,
-  peer_pool, spec/[datatypes, network]
+  peer_pool, spec/[datatypes, network], ./time
 
 export
   version, multiaddress, peer_pool, peerinfo, p2pProtocol,
@@ -720,41 +720,50 @@ proc dialPeer*(node: Eth2Node, peerInfo: PeerInfo) {.async.} =
   inc nbc_successful_dials
   debug "Network handshakes completed"
 
-proc connectWorker(network: Eth2Node) {.async.} =
+proc connectWorker(network: Eth2Node, bootstrapPeerIDs: seq[PeerID]) {.async.} =
   debug "Connection worker started"
 
   while true:
-    let pi = await network.connQueue.popFirst()
-    let r1 = network.peerPool.hasPeer(pi.peerId)
-    let r2 = network.isSeen(pi)
-    let r3 = network.connTable.hasKey(pi.peerId)
+    let remotePeerInfo = await network.connQueue.popFirst()
+    let peerPoolHasRemotePeer = network.peerPool.hasPeer(remotePeerInfo.peerId)
+    let seenTableHasRemotePeer = network.isSeen(remotePeerInfo)
+    let remotePeerAlreadyConnected = network.connTable.hasKey(remotePeerInfo.peerId)
 
-    if not(r1) and not(r2) and not(r3):
-      network.connTable[pi.peerId] = pi
+    if not(peerPoolHasRemotePeer) and not(seenTableHasRemotePeer) and not(remotePeerAlreadyConnected):
+      network.connTable[remotePeerInfo.peerId] = remotePeerInfo
       try:
         # We trying to connect to peers which are not in PeerPool, SeenTable and
         # ConnTable.
-        var fut = network.dialPeer(pi)
+        var fut = network.dialPeer(remotePeerInfo)
         # We discarding here just because we going to check future state, to avoid
         # condition where connection happens and timeout reached.
-        let res = await withTimeout(fut, network.connectTimeout)
+        discard await withTimeout(fut, network.connectTimeout)
         # We handling only timeout and errors, because successfull connections
        # will be stored in PeerPool.
         if fut.finished():
           if fut.failed() and not(fut.cancelled()):
-            debug "Unable to establish connection with peer", peer = pi.id,
+            debug "Unable to establish connection with peer", peer = remotePeerInfo.id,
                   errMsg = fut.readError().msg
             inc nbc_failed_dials
-            network.addSeen(pi, SeenTableTimeDeadPeer)
+            if remotePeerInfo.peerId in bootstrapPeerIDs:
+              # keep trying
+              await network.connQueue.addLast(remotePeerInfo)
Contributor:

Currently, this add is redundant: in discovery, bootstrap nodes are also treated specially and are never removed from the routing table, so they would keep getting added in the discovery loop. Not adding them to the seen list, as you do here, is required of course.

Contributor Author:

I see that bootstrap nodes are exempt from removal in Protocol.replaceNode(), but where are they re-added to Eth2Node.connQueue?

Contributor:

In the discoveryLoop they have a chance of being passed again through the randomNodes call. Only a chance, as it is a random selection, but if no bootstrap nodes were reachable (= no other nodes were added to the routing table), it will always be those bootstrap nodes.

And since you no longer call addSeen for those bootstrap nodes, a new connection should be attempted, I believe.
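For context, a condensed sketch of the discovery-loop behaviour being described - not the exact runDiscoveryLoop code; the wantedPeers field and the one-second pause are assumptions, while randomNodes, toTypedRecord and toPeerInfo behave as in the rest of this diff:

  proc discoveryLoopSketch(node: Eth2Node) {.async.} =
    # Condensed paraphrase: keep asking discovery for random nodes and feed
    # them to the connection workers via connQueue.
    while true:
      if node.peerPool.len < node.wantedPeers:
        # randomNodes can hand back the bootstrap nodes again, since they are
        # never evicted from the routing table.
        for discovered in node.discovery.randomNodes(node.wantedPeers - node.peerPool.len):
          let typedRecord = discovered.record.toTypedRecord()
          if typedRecord.isOk:
            let peerInfo = typedRecord.value.toPeerInfo()
            if peerInfo != nil:
              # Bootstrap peers are no longer put in the seen table on failure,
              # so the connect worker will retry them on a later pass.
              await node.connQueue.addLast(peerInfo)
      await sleepAsync(1.seconds)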

Contributor:

Here you need to check not only the PeerID, but also whether the PeerInfo you are sending to the connection worker has a TCP-based address inside. Otherwise it will fail immediately, but you will keep adding this item back to the connection queue. If the number of bootstrap nodes is larger than the number of connection workers, this will create an endless loop and the node will never attempt to connect to real nodes.

Contributor Author:

> Currently, this add is redundant

Removed.

Contributor:

> Here you need to check not only the PeerID, but also whether the PeerInfo you are sending to the connection worker has a TCP-based address inside. Otherwise it will fail immediately, but you will keep adding this item back to the connection queue. If the number of bootstrap nodes is larger than the number of connection workers, this will create an endless loop and the node will never attempt to connect to real nodes.

That filter should be done earlier, in the discovery loop (it already happens there on the eth2 field).
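A minimal sketch of the TCP-address filter suggested in this thread, as it could be applied in the discovery loop before queueing a peer; hasTCPAddress is a hypothetical helper, while TCP, match and the addrs field come from nim-libp2p's multiaddress and peerinfo modules:

  import sequtils
  import libp2p/[multiaddress, peerinfo]

  proc hasTCPAddress(peerInfo: PeerInfo): bool =
    # Only queue peers that expose a TCP multiaddress; otherwise the dial
    # fails immediately and a bootstrap peer would be re-queued forever.
    peerInfo.addrs.anyIt(TCP.match(it))

Applied before network.connQueue.addLast, a check like this keeps undialable bootstrap records out of the retry path.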

+            else:
+              network.addSeen(remotePeerInfo, SeenTableTimeDeadPeer)
           continue
-        debug "Connection to remote peer timed out", peer = pi.id
+        debug "Connection to remote peer timed out", peer = remotePeerInfo.id
         inc nbc_timeout_dials
-        network.addSeen(pi, SeenTableTimeTimeout)
+        if remotePeerInfo.peerId in bootstrapPeerIDs:
+          # keep trying
+          await network.connQueue.addLast(remotePeerInfo)
+        else:
+          network.addSeen(remotePeerInfo, SeenTableTimeTimeout)
       finally:
-        network.connTable.del(pi.peerId)
+        network.connTable.del(remotePeerInfo.peerId)
     else:
       trace "Peer is already connected, connecting or already seen",
-            peer = pi.id, peer_pool_has_peer = $r1, seen_table_has_peer = $r2,
-            connecting_peer = $r3, seen_table_size = len(network.seenTable)
+            peer = remotePeerInfo.id, peer_pool_has_peer = $peerPoolHasRemotePeer, seen_table_has_peer = $seenTableHasRemotePeer,
+            connecting_peer = $remotePeerAlreadyConnected, seen_table_size = len(network.seenTable)

 proc runDiscoveryLoop*(node: Eth2Node) {.async.} =
   debug "Starting discovery loop"
@@ -833,8 +842,16 @@ proc startListening*(node: Eth2Node) =
   node.discovery.open()
 
 proc start*(node: Eth2Node) {.async.} =
+  var bootstrapPeerIDs: seq[PeerID]
+  for record in node.discovery.bootstrapRecords:
+    let typedRecord = record.toTypedRecord()
+    if typedRecord.isOk:
+      let peerInfo = typedRecord.value.toPeerInfo()
+      if peerInfo != nil:
+        bootstrapPeerIDs.add(peerInfo.peerId)
+
   for i in 0 ..< ConcurrentConnections:
-    node.connWorkers.add connectWorker(node)
+    node.connWorkers.add connectWorker(node, bootstrapPeerIDs)
 
   node.libp2pTransportLoops = await node.switch.start()
   node.discovery.start()
@@ -1130,17 +1147,19 @@ proc announcedENR*(node: Eth2Node): enr.Record =
 proc shortForm*(id: KeyPair): string =
   $PeerID.init(id.pubkey)
 
+let BOOTSTRAP_NODE_CHECK_INTERVAL = 30.seconds
Contributor:

This is a constant and should be placed at the beginning of the file, with a comment on where it is used.

Contributor Author:

The further it is from the place it's used, the harder it is to find. Jumping all over the file just to understand a few lines of code makes our job harder.

+proc checkIfConnectedToBootstrapNode(p: pointer) {.gcsafe.} =
Contributor (@cheatfate), Jul 23, 2020:

This procedure looks very ugly, but it could look like this:

  proc checkIfConnected(node: Eth2Node) {.async.} =
    while true:
      await sleepAsync(30.seconds)
      if node.discovery.bootstrapRecords.len > 0 and len(node.peerPool) == 0:
        warn "Failed to connect to any node", bootstrapEnrs = node.discovery.bootstrapRecords

  traceAsyncErrors checkIfConnected(node)

But also, you should not use metric values for program logic; it is better to check the PeerPool for the number of connections currently available.

Contributor Author:

> you should not use metric values for program logic

Why not? Keeps the number of variables low.

Contributor:

There is already a place which performs tracking of libp2p peers.

Member:

> Why not? Keeps the number of variables low.

Metrics are a one-way data flow out of the application - they're an optional feature and we should be able to compile the application with them completely disabled at compile time. Using them as regular variables makes the code harder to analyze, as they now start to serve multiple orthogonal concerns.
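A sketch of the metric-free alternative argued for here: track "connected at least once" in a dedicated field instead of reading nbc_successful_dials. The ConnectionTracker type and everConnectedToBootstrapNode field are hypothetical, used only for illustration:

  type ConnectionTracker = object
    # Hypothetical flag: set once any bootstrap dial succeeds, and read by
    # the periodic check instead of the metric's value.
    everConnectedToBootstrapNode: bool

  proc shouldKeepWarning(tracker: ConnectionTracker, haveBootstrapRecords: bool): bool =
    haveBootstrapRecords and not tracker.everConnectedToBootstrapNode

The metric can still be incremented in the same place, but the program logic no longer depends on whether metrics are compiled in.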

Contributor Author:

Your version might look better to you, but it does the wrong thing by continuing to check for that condition after it became false. You fix that and it becomes uglier than my version. Furthermore, you dropped the part of its name that made clear what it actually does.

Also, do you remember what traceAsyncErrors() does without reading its code? I don't. Its name sounds like something that's currently broken in Chronos.
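For comparison, a sketch of an async variant that keeps the reviewer's shape but stops checking once the condition clears, which is the behaviour the author is after; the names and the 30-second interval mirror the surrounding diff, and this is not the merged code:

  proc checkIfConnectedToBootstrapNodeSketch(node: Eth2Node) {.async.} =
    # Warn periodically, but stop as soon as one bootstrap dial has succeeded.
    while true:
      await sleepAsync(30.seconds)
      if node.discovery.bootstrapRecords.len == 0 or nbc_successful_dials.value > 0:
        break
      warn "Failed to connect to any bootstrap node",
        bootstrapEnrs = node.discovery.bootstrapRecords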

Contributor:

@stefantalpalaru, please rename your PR title, because the PR content is quite different from the PR title.

You should name it: "deal with temporary loss of network connectivity while node is starting".

Contributor:

> BTW, for what you want, "Failed to connect to any node" is the wrong message, because it implies historical data which you don't have. You need "Not connected to any peer right now. There may be a problem with your network connectivity."

I'm not the author of this PR; I have made a proposal. If you do not like the message, you can easily change it.

Contributor:

> Exactly. I want that warning to only appear before successfully connecting to a peer. Detecting any other problem is a separate concern better addressed in a separate procedure, in a separate PR.

Your PR could easily fix the issue for both cases - "network loss on node start" and "network loss while node is working" - and this can be done by changing 2 LOCs... So why not introduce such changes in this PR?

Contributor Author:

> create a dedicated boolean for it then

Done.

> You should name it: deal with temporary loss of network connectivity while node is starting.

But reducing those two timeouts is useful all the time, not just at program start.

> traceAsyncErrors is catching exceptions in an absolutely legal way: https://github.com/status-im/nim-eth/blob/master/eth/async_utils.nim#L11

The logic inside catchOrQuit() is wrong right now: https://github.com/status-im/nim-eth/blob/765883c454be726799f4e724b4dc2ca8fe25bc74/eth/async_utils.nim#L7

Defects don't end up in Future.error, but are raised directly, so that else branch is unreachable.
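To make that concrete, the helper behind traceAsyncErrors looks roughly like the following (paraphrased from the linked nim-eth revision, so the exact wording may differ); because an {.async.} proc only ever stores CatchableError in its future, the else branch cannot run:

  proc catchOrQuit(error: Exception) =
    if error of CatchableError:
      trace "Async operation ended with a recoverable error", err = error.msg
    else:
      # Defects are re-raised from the event loop instead of being stored in
      # Future.error, so this branch is effectively unreachable.
      fatal "Fatal exception reached", err = error.msg
      quit 1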

Contributor Author:

> Your PR could easily fix the issue for both cases - "network loss on node start" and "network loss while node is working". And this can be done by changing 2 LOCs...

It might get more complicated than that. Before telling the average user that his network might be down (or that there's a problem with the firewall), I should look at any other network traffic I might have available - stuff like Discovery v5 nodes or any UDP multicasting we might be doing and getting replies from.

I should also have more reliable ways of establishing whether the problem is with the Eth2 network itself (and I should probably ping some servers with high uptime before making a fool of myself by lying to the user).

+  # Keep showing warnings until we connect to at least one bootstrap node
+  # successfully, in order to allow detection of an invalid configuration.
+  let node = cast[Eth2Node](p)
+  if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
+    warn "Failed to connect to any bootstrap node",
+      bootstrapEnrs = node.discovery.bootstrapRecords
+    addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, p)

 proc startLookingForPeers*(node: Eth2Node) {.async.} =
   await node.start()
-
-  proc checkIfConnectedToBootstrapNode {.async.} =
-    await sleepAsync(30.seconds)
-    if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
-      fatal "Failed to connect to any bootstrap node. Quitting",
-        bootstrapEnrs = node.discovery.bootstrapRecords
-      quit 1
-
-  traceAsyncErrors checkIfConnectedToBootstrapNode()
+  addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, node[].addr)

 func peersCount*(node: Eth2Node): int =
   len(node.peerPool)