deal with a temporary loss of network connectivity #1354
@@ -23,7 +23,7 @@ import
   # Beacon node modules
   version, conf, eth2_discovery, libp2p_json_serialization, conf,
   ssz/ssz_serialization,
-  peer_pool, spec/[datatypes, network]
+  peer_pool, spec/[datatypes, network], ./time

 export
   version, multiaddress, peer_pool, peerinfo, p2pProtocol,
@@ -720,41 +720,50 @@ proc dialPeer*(node: Eth2Node, peerInfo: PeerInfo) {.async.} =
   inc nbc_successful_dials
   debug "Network handshakes completed"

-proc connectWorker(network: Eth2Node) {.async.} =
+proc connectWorker(network: Eth2Node, bootstrapPeerIDs: seq[PeerID]) {.async.} =
   debug "Connection worker started"

   while true:
-    let pi = await network.connQueue.popFirst()
-    let r1 = network.peerPool.hasPeer(pi.peerId)
-    let r2 = network.isSeen(pi)
-    let r3 = network.connTable.hasKey(pi.peerId)
+    let remotePeerInfo = await network.connQueue.popFirst()
+    let peerPoolHasRemotePeer = network.peerPool.hasPeer(remotePeerInfo.peerId)
+    let seenTableHasRemotePeer = network.isSeen(remotePeerInfo)
+    let remotePeerAlreadyConnected = network.connTable.hasKey(remotePeerInfo.peerId)

-    if not(r1) and not(r2) and not(r3):
-      network.connTable[pi.peerId] = pi
+    if not(peerPoolHasRemotePeer) and not(seenTableHasRemotePeer) and not(remotePeerAlreadyConnected):
+      network.connTable[remotePeerInfo.peerId] = remotePeerInfo
       try:
         # We trying to connect to peers which are not in PeerPool, SeenTable and
         # ConnTable.
-        var fut = network.dialPeer(pi)
+        var fut = network.dialPeer(remotePeerInfo)
         # We discarding here just because we going to check future state, to avoid
         # condition where connection happens and timeout reached.
-        let res = await withTimeout(fut, network.connectTimeout)
+        discard await withTimeout(fut, network.connectTimeout)
         # We handling only timeout and errors, because successfull connections
         # will be stored in PeerPool.
         if fut.finished():
           if fut.failed() and not(fut.cancelled()):
-            debug "Unable to establish connection with peer", peer = pi.id,
+            debug "Unable to establish connection with peer", peer = remotePeerInfo.id,
                   errMsg = fut.readError().msg
             inc nbc_failed_dials
-            network.addSeen(pi, SeenTableTimeDeadPeer)
+            if remotePeerInfo.peerId in bootstrapPeerIDs:
+              # keep trying
+              await network.connQueue.addLast(remotePeerInfo)
+            else:
+              network.addSeen(remotePeerInfo, SeenTableTimeDeadPeer)
           continue
-        debug "Connection to remote peer timed out", peer = pi.id
+        debug "Connection to remote peer timed out", peer = remotePeerInfo.id
         inc nbc_timeout_dials
-        network.addSeen(pi, SeenTableTimeTimeout)
+        if remotePeerInfo.peerId in bootstrapPeerIDs:
+          # keep trying
+          await network.connQueue.addLast(remotePeerInfo)
+        else:
+          network.addSeen(remotePeerInfo, SeenTableTimeTimeout)
       finally:
-        network.connTable.del(pi.peerId)
+        network.connTable.del(remotePeerInfo.peerId)
     else:
       trace "Peer is already connected, connecting or already seen",
-            peer = pi.id, peer_pool_has_peer = $r1, seen_table_has_peer = $r2,
-            connecting_peer = $r3, seen_table_size = len(network.seenTable)
+            peer = remotePeerInfo.id, peer_pool_has_peer = $peerPoolHasRemotePeer, seen_table_has_peer = $seenTableHasRemotePeer,
+            connecting_peer = $remotePeerAlreadyConnected, seen_table_size = len(network.seenTable)

 proc runDiscoveryLoop*(node: Eth2Node) {.async.} =
   debug "Starting discovery loop"
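The timeout handling above is easier to see in isolation: withTimeout races the dial future against a deadline, and the worker then inspects the future's own state rather than the boolean result, so a connection that completes at the same moment the timeout fires is not misclassified. A minimal, self-contained Chronos sketch (ours, not from the PR; slowDial stands in for dialPeer):

    import chronos

    # Stand-in for dialPeer: a handshake that takes longer than our deadline.
    proc slowDial() {.async.} =
      await sleepAsync(2.seconds)

    proc demo() {.async.} =
      var fut = slowDial()
      # The boolean result is discarded; fut's own state is what matters, which
      # sidesteps the race between completion and timeout.
      discard await withTimeout(fut, 1.seconds)
      if fut.finished():
        if fut.failed() and not(fut.cancelled()):
          echo "dial failed: ", fut.readError().msg
        else:
          echo "dial succeeded"
      else:
        echo "dial timed out"  # the PR re-queues bootstrap peers here

    waitFor demo()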
@@ -833,8 +842,16 @@ proc startListening*(node: Eth2Node) =
   node.discovery.open()

 proc start*(node: Eth2Node) {.async.} =
+  var bootstrapPeerIDs: seq[PeerID]
+  for record in node.discovery.bootstrapRecords:
+    let typedRecord = record.toTypedRecord()
+    if typedRecord.isOk:
+      let peerInfo = typedRecord.value.toPeerInfo()
+      if peerInfo != nil:
+        bootstrapPeerIDs.add(peerInfo.peerId)
+
   for i in 0 ..< ConcurrentConnections:
-    node.connWorkers.add connectWorker(node)
+    node.connWorkers.add connectWorker(node, bootstrapPeerIDs)

   node.libp2pTransportLoops = await node.switch.start()
   node.discovery.start()
@@ -1130,17 +1147,19 @@ proc announcedENR*(node: Eth2Node): enr.Record =
 proc shortForm*(id: KeyPair): string =
   $PeerID.init(id.pubkey)

+let BOOTSTRAP_NODE_CHECK_INTERVAL = 30.seconds
Reviewer: This is a constant and should be placed at the beginning of the file, with a comment noting where it is used.

Author: The further it is from the place it's used, the harder it is to find. Jumping all over the file just to understand a few lines of code makes our job harder.
+
+proc checkIfConnectedToBootstrapNode(p: pointer) {.gcsafe.} =
Reviewer: This procedure looks very ugly, but it can look like this:

    proc checkIfConnected(node: Eth2Node) {.async.} =
      while true:
        await sleepAsync(30.seconds)
        if node.discovery.bootstrapRecords.len > 0 and len(node.peerPool) == 0:
          warn "Failed to connect to any node", bootstrapEnrs = node.discovery.bootstrapRecords

    traceAsyncErrors checkIfConnected(node)

But you also should not use metric values for program logic; it is better to check the PeerPool for the number of connections currently available.

Author: Why not? It keeps the number of variables low.

Reviewer: There is already a place which performs the tracking of libp2p peers.

Reviewer: Metrics are a one-way data flow out of the application. They are an optional feature, and we should be able to compile the application with them completely disabled at compile time. Using them as regular variables makes the code harder to analyze, as they then start to serve multiple orthogonal concerns.

Author: Your version might look better to you, but it does the wrong thing by continuing to check for that condition after it became false. You fix that and it becomes uglier than my version. Furthermore, you dropped the part of its name that made clear what it actually does. Also, do you remember what …

Reviewer: @stefantalpalaru, please rename your PR title, because the PR content is absolutely different from the PR title. You should name it: …

Reviewer: I'm not the author of this PR; I have made a proposal. If you do not like the message, you can easily change it.

Reviewer: Your PR could easily fix the issue for both cases, "network loss on node start" and "network loss while the node is working", and this can be done by changing 2 LOCs. So why not introduce such changes in this PR?

Author: Done.

Author: But reducing those two timeouts is useful all the time, not just at program start.

Author: The logic inside … Defects don't end up in …

Author: It might get more complicated than that. Before telling the average user that his network might be down (or that there's a problem with the firewall), I should look at any other network traffic I might have available, such as Discovery v5 nodes or any UDP multicasting we might be doing and getting replies from. I should also have more reliable ways of establishing whether the problem is with the Eth2 network itself (and I should probably ping some servers with high uptime before making a fool of myself by lying to the user).
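A minimal sketch (ours, not from the PR) of a middle ground between the two positions above: it polls the PeerPool rather than a metric, and the loop exits on its own once at least one peer is connected, so the check does not keep running after the condition becomes false. The name warnUntilBootstrapConnected and the 30-second interval are assumptions taken from the surrounding discussion:

    import chronos, chronicles

    proc warnUntilBootstrapConnected(node: Eth2Node) {.async.} =
      # Keep warning only while bootstrap records exist and no peer has been
      # pooled; the loop terminates as soon as the first connection lands.
      while node.discovery.bootstrapRecords.len > 0 and len(node.peerPool) == 0:
        warn "Failed to connect to any bootstrap node",
          bootstrapEnrs = node.discovery.bootstrapRecords
        await sleepAsync(30.seconds)

    # Launched fire-and-forget, mirroring the reviewer's suggestion:
    # traceAsyncErrors warnUntilBootstrapConnected(node)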
+  # Keep showing warnings until we connect to at least one bootstrap node
+  # successfully, in order to allow detection of an invalid configuration.
+  let node = cast[Eth2Node](p)
+  if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
+    warn "Failed to connect to any bootstrap node",
+      bootstrapEnrs = node.discovery.bootstrapRecords
+    addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, p)
+
 proc startLookingForPeers*(node: Eth2Node) {.async.} =
   await node.start()

-  proc checkIfConnectedToBootstrapNode {.async.} =
-    await sleepAsync(30.seconds)
-    if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
-      fatal "Failed to connect to any bootstrap node. Quitting",
-        bootstrapEnrs = node.discovery.bootstrapRecords
-      quit 1
-
-  traceAsyncErrors checkIfConnectedToBootstrapNode()
+  addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, node[].addr)

 func peersCount*(node: Eth2Node): int =
   len(node.peerPool)
Reviewer: Currently, this add is redundant: in discovery, bootstrap nodes are also treated specially and are never removed from the routing table, so they would keep getting added in the discovery loop. The not adding to the seen list that you do there is required, of course.
Author: I see that bootstrap nodes are exempt from removal in Protocol.replaceNode(), but where are they re-added to Eth2Node.connQueue?
Reviewer: In the discovery loop they have a chance of being passed again through the randomNodes call. Chance, as it is a random selection, but if no bootstrap nodes were reachable (= no other nodes were added to the routing table), it will always be those bootstrap nodes. And since you no longer do the addSeen for those bootstrap nodes, a new connection should be attempted, I believe.
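A rough sketch of the mechanism the reviewer describes, under the assumption that the discovery loop periodically samples the routing table via randomNodes and feeds results into connQueue; the sample size, cadence, and the conversion chain are assumptions modeled on the toTypedRecord/toPeerInfo usage in this diff:

    proc discoveryLoopSketch(node: Eth2Node) {.async.} =
      while true:
        # Bootstrap records are never evicted from the routing table, so when
        # nothing else is reachable, random sampling keeps returning them ...
        for discoveredNode in node.discovery.randomNodes(10):
          let typedRecord = discoveredNode.record.toTypedRecord()
          if typedRecord.isOk:
            let peerInfo = typedRecord.value.toPeerInfo()
            # ... and since addSeen is now skipped for bootstrap nodes, they
            # pass the isSeen filter and are re-queued for a fresh dial.
            if peerInfo != nil and not node.isSeen(peerInfo):
              await node.connQueue.addLast(peerInfo)
        await sleepAsync(1.seconds)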
Reviewer: Here you need to check not only the PeerID; you also need to check whether the PeerInfo you are sending to the connection worker has a TCP-based address inside. Otherwise it will fail immediately, but you will keep adding this item back to the connection queue. If the number of bootstrap nodes is greater than the number of connection workers, this creates an endless loop and the node will never attempt to connect to real nodes.
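A sketch of the check being requested, assuming nim-libp2p's multiaddress pattern matching (TCP.match); hasTcpAddress is our hypothetical helper name, not part of the PR:

    import libp2p/[multiaddress, peerinfo]

    proc hasTcpAddress(peerInfo: PeerInfo): bool =
      # Only treat a peer as dialable if at least one of its advertised
      # multiaddresses contains a TCP transport.
      for address in peerInfo.addrs:
        if TCP.match(address):
          return true
      false

    # Hypothetical use at the re-queue site in connectWorker:
    # if remotePeerInfo.peerId in bootstrapPeerIDs and hasTcpAddress(remotePeerInfo):
    #   await network.connQueue.addLast(remotePeerInfo)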
Author: Removed.
Reviewer: That filter should be done earlier, in the discovery loop (it already happens there on the eth2 field).