-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
deal with a temporary loss of network connectivity #1354
Changes from all commits
43961f9
09f31ab
2ccd959
f6a6e30
646bb91
c7e5cae
cbb0aca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ import | |
# Beacon node modules | ||
version, conf, eth2_discovery, libp2p_json_serialization, conf, | ||
ssz/ssz_serialization, | ||
peer_pool, spec/[datatypes, network] | ||
peer_pool, spec/[datatypes, network], ./time | ||
|
||
export | ||
version, multiaddress, peer_pool, peerinfo, p2pProtocol, | ||
|
@@ -205,9 +205,9 @@ const | |
ConcurrentConnections* = 10 | ||
## Maximum number of active concurrent connection requests. | ||
|
||
SeenTableTimeTimeout* = 10.minutes | ||
SeenTableTimeTimeout* = 1.minutes | ||
## Seen period of time for timeout connections | ||
SeenTableTimeDeadPeer* = 10.minutes | ||
SeenTableTimeDeadPeer* = 1.minutes | ||
## Period of time for dead peers. | ||
SeenTableTimeIrrelevantNetwork* = 24.hours | ||
## Period of time for `IrrelevantNetwork` error reason. | ||
|
@@ -216,6 +216,8 @@ const | |
SeemTableTimeFaultOrError* = 10.minutes | ||
## Period of time for `FaultOnError` error reason. | ||
|
||
var successfullyDialledAPeer = false # used to show a warning | ||
|
||
template neterr(kindParam: Eth2NetworkingErrorKind): auto = | ||
err(type(result), Eth2NetworkingError(kind: kindParam)) | ||
|
||
|
@@ -718,43 +720,49 @@ proc dialPeer*(node: Eth2Node, peerInfo: PeerInfo) {.async.} = | |
await performProtocolHandshakes(peer) | ||
|
||
inc nbc_successful_dials | ||
successfullyDialledAPeer = true | ||
debug "Network handshakes completed" | ||
|
||
proc connectWorker(network: Eth2Node) {.async.} = | ||
debug "Connection worker started" | ||
|
||
while true: | ||
let pi = await network.connQueue.popFirst() | ||
let r1 = network.peerPool.hasPeer(pi.peerId) | ||
let r2 = network.isSeen(pi) | ||
let r3 = network.connTable.hasKey(pi.peerId) | ||
let | ||
remotePeerInfo = await network.connQueue.popFirst() | ||
peerPoolHasRemotePeer = network.peerPool.hasPeer(remotePeerInfo.peerId) | ||
seenTableHasRemotePeer = network.isSeen(remotePeerInfo) | ||
remotePeerAlreadyConnected = network.connTable.hasKey(remotePeerInfo.peerId) | ||
|
||
if not(r1) and not(r2) and not(r3): | ||
network.connTable[pi.peerId] = pi | ||
if not(peerPoolHasRemotePeer) and not(seenTableHasRemotePeer) and not(remotePeerAlreadyConnected): | ||
network.connTable[remotePeerInfo.peerId] = remotePeerInfo | ||
try: | ||
# We trying to connect to peers which are not in PeerPool, SeenTable and | ||
# ConnTable. | ||
var fut = network.dialPeer(pi) | ||
var fut = network.dialPeer(remotePeerInfo) | ||
# We discarding here just because we going to check future state, to avoid | ||
# condition where connection happens and timeout reached. | ||
let res = await withTimeout(fut, network.connectTimeout) | ||
discard await withTimeout(fut, network.connectTimeout) | ||
# We handling only timeout and errors, because successfull connections | ||
# will be stored in PeerPool. | ||
if fut.finished(): | ||
if fut.failed() and not(fut.cancelled()): | ||
debug "Unable to establish connection with peer", peer = pi.id, | ||
debug "Unable to establish connection with peer", peer = remotePeerInfo.id, | ||
errMsg = fut.readError().msg | ||
inc nbc_failed_dials | ||
network.addSeen(pi, SeenTableTimeDeadPeer) | ||
network.addSeen(remotePeerInfo, SeenTableTimeDeadPeer) | ||
continue | ||
debug "Connection to remote peer timed out", peer = pi.id | ||
debug "Connection to remote peer timed out", peer = remotePeerInfo.id | ||
inc nbc_timeout_dials | ||
network.addSeen(pi, SeenTableTimeTimeout) | ||
network.addSeen(remotePeerInfo, SeenTableTimeTimeout) | ||
finally: | ||
network.connTable.del(pi.peerId) | ||
network.connTable.del(remotePeerInfo.peerId) | ||
else: | ||
trace "Peer is already connected, connecting or already seen", | ||
peer = pi.id, peer_pool_has_peer = $r1, seen_table_has_peer = $r2, | ||
connecting_peer = $r3, seen_table_size = len(network.seenTable) | ||
peer = remotePeerInfo.id, peer_pool_has_peer = $peerPoolHasRemotePeer, seen_table_has_peer = $seenTableHasRemotePeer, | ||
connecting_peer = $remotePeerAlreadyConnected, seen_table_size = len(network.seenTable) | ||
|
||
# Prevent (a purely theoretical) high CPU usage when losing connectivity. | ||
await sleepAsync(1.seconds) | ||
|
||
proc runDiscoveryLoop*(node: Eth2Node) {.async.} = | ||
debug "Starting discovery loop" | ||
|
@@ -1130,17 +1138,19 @@ proc announcedENR*(node: Eth2Node): enr.Record = | |
proc shortForm*(id: KeyPair): string = | ||
$PeerID.init(id.pubkey) | ||
|
||
let BOOTSTRAP_NODE_CHECK_INTERVAL = 30.seconds | ||
proc checkIfConnectedToBootstrapNode(p: pointer) {.gcsafe.} = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This procedure looks very ugly, but it can look like this: proc checkIfConnected(node: Eth2Node) {.async.} =
while true:
await sleepAsync(30.seconds)
if node.discovery.bootstrapRecords.len > 0 and len(node.peerPool) == 0:
warn "Failed to connect to any node", bootstrapEnrs = node.discovery.bootstrapRecords
traceAsyncErrors checkIfConnected(node)) But also you should not use metric values for program logic, it is better to check PeerPool for number of connections currently available. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Why not? Keeps the number of variables low. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There already present place which perform tracking of libp2p peers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
metrics are a one-way data flow out of the application - they're an optional feature and we should be able to compile the application with them completely disabled at compile time - using them as regular variables makes the code harder to analyze as they now start to serve multiple orthogonal concerns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your version might look better to you, but it does the wrong thing by continuing to check for that condition after it became false. You fix that and it becomes uglier than my version. Further more, you dropped that part of its name that made clear what it actually does. Also, do you remember what There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @stefantalpalaru rename your PR title please, because PR content is absolutely different from PR title. You should name it: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm not an author of this PR i have made a proposal, if you do not like message you can easily change it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Your PR could easily fix the issue for both cases - "network loss on node start", "network loss while node is working". And this can be done by changing 2 LOCs... So why not introduce such changes in this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Done.
But reducing those two timeouts is useful all the time, not just at program start.
The logic inside Defects don't end up in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It might get more complicated than that. Before telling the average user that his network might be down (or there's a problem with the firewall), I should look at any other network traffic I might have available - stuff like Discovery v5 nodes or any UDP multicasting we might be doing and getting replies from. I should also have more reliable ways of establishing if the problem is with the Eth2 network itself (and I should probably ping some servers with high uptime before making a fool of myself by lying to the user). |
||
# Keep showing warnings until we connect to at least one bootstrap node | ||
# successfully, in order to allow detection of an invalid configuration. | ||
let node = cast[Eth2Node](p) | ||
if node.discovery.bootstrapRecords.len > 0 and not successfullyDialledAPeer: | ||
warn "Failed to connect to any bootstrap node", | ||
bootstrapEnrs = node.discovery.bootstrapRecords | ||
addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, p) | ||
|
||
proc startLookingForPeers*(node: Eth2Node) {.async.} = | ||
await node.start() | ||
|
||
proc checkIfConnectedToBootstrapNode {.async.} = | ||
await sleepAsync(30.seconds) | ||
if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0: | ||
fatal "Failed to connect to any bootstrap node. Quitting", | ||
bootstrapEnrs = node.discovery.bootstrapRecords | ||
quit 1 | ||
|
||
traceAsyncErrors checkIfConnectedToBootstrapNode() | ||
addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, node[].addr) | ||
|
||
func peersCount*(node: Eth2Node): int = | ||
len(node.peerPool) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is constant and should be placed at the beginning of the file with comment where it used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The further it is from the place it's used, the harder it is to find. Jumping all over the file just to understand a few lines of code makes our job harder.