deal with a temporary loss of network connectivity #1354

Merged (7 commits), Jul 23, 2020

beacon_chain/eth2_network.nim (64 changes: 37 additions & 27 deletions)

@@ -23,7 +23,7 @@ import
   # Beacon node modules
   version, conf, eth2_discovery, libp2p_json_serialization, conf,
   ssz/ssz_serialization,
-  peer_pool, spec/[datatypes, network]
+  peer_pool, spec/[datatypes, network], ./time

 export
   version, multiaddress, peer_pool, peerinfo, p2pProtocol,
@@ -205,9 +205,9 @@ const
   ConcurrentConnections* = 10
     ## Maximum number of active concurrent connection requests.

-  SeenTableTimeTimeout* = 10.minutes
+  SeenTableTimeTimeout* = 1.minutes
     ## Seen period of time for timeout connections
-  SeenTableTimeDeadPeer* = 10.minutes
+  SeenTableTimeDeadPeer* = 1.minutes
     ## Period of time for dead peers.
   SeenTableTimeIrrelevantNetwork* = 24.hours
     ## Period of time for `IrrelevantNetwork` error reason.
@@ -216,6 +216,8 @@ const
   SeemTableTimeFaultOrError* = 10.minutes
     ## Period of time for `FaultOnError` error reason.

+var successfullyDialledAPeer = false # used to show a warning
+
 template neterr(kindParam: Eth2NetworkingErrorKind): auto =
   err(type(result), Eth2NetworkingError(kind: kindParam))

@@ -718,43 +720,49 @@ proc dialPeer*(node: Eth2Node, peerInfo: PeerInfo) {.async.} =
   await performProtocolHandshakes(peer)

   inc nbc_successful_dials
+  successfullyDialledAPeer = true
   debug "Network handshakes completed"

 proc connectWorker(network: Eth2Node) {.async.} =
   debug "Connection worker started"

   while true:
-    let pi = await network.connQueue.popFirst()
-    let r1 = network.peerPool.hasPeer(pi.peerId)
-    let r2 = network.isSeen(pi)
-    let r3 = network.connTable.hasKey(pi.peerId)
+    let
+      remotePeerInfo = await network.connQueue.popFirst()
+      peerPoolHasRemotePeer = network.peerPool.hasPeer(remotePeerInfo.peerId)
+      seenTableHasRemotePeer = network.isSeen(remotePeerInfo)
+      remotePeerAlreadyConnected = network.connTable.hasKey(remotePeerInfo.peerId)

-    if not(r1) and not(r2) and not(r3):
-      network.connTable[pi.peerId] = pi
+    if not(peerPoolHasRemotePeer) and not(seenTableHasRemotePeer) and not(remotePeerAlreadyConnected):
+      network.connTable[remotePeerInfo.peerId] = remotePeerInfo
       try:
         # We trying to connect to peers which are not in PeerPool, SeenTable and
         # ConnTable.
-        var fut = network.dialPeer(pi)
+        var fut = network.dialPeer(remotePeerInfo)
         # We discarding here just because we going to check future state, to avoid
         # condition where connection happens and timeout reached.
-        let res = await withTimeout(fut, network.connectTimeout)
+        discard await withTimeout(fut, network.connectTimeout)
         # We handling only timeout and errors, because successfull connections
         # will be stored in PeerPool.
         if fut.finished():
           if fut.failed() and not(fut.cancelled()):
-            debug "Unable to establish connection with peer", peer = pi.id,
+            debug "Unable to establish connection with peer", peer = remotePeerInfo.id,
                   errMsg = fut.readError().msg
             inc nbc_failed_dials
-            network.addSeen(pi, SeenTableTimeDeadPeer)
+            network.addSeen(remotePeerInfo, SeenTableTimeDeadPeer)
           continue
-        debug "Connection to remote peer timed out", peer = pi.id
+        debug "Connection to remote peer timed out", peer = remotePeerInfo.id
         inc nbc_timeout_dials
-        network.addSeen(pi, SeenTableTimeTimeout)
+        network.addSeen(remotePeerInfo, SeenTableTimeTimeout)
       finally:
-        network.connTable.del(pi.peerId)
+        network.connTable.del(remotePeerInfo.peerId)
     else:
       trace "Peer is already connected, connecting or already seen",
-            peer = pi.id, peer_pool_has_peer = $r1, seen_table_has_peer = $r2,
-            connecting_peer = $r3, seen_table_size = len(network.seenTable)
+            peer = remotePeerInfo.id, peer_pool_has_peer = $peerPoolHasRemotePeer, seen_table_has_peer = $seenTableHasRemotePeer,
+            connecting_peer = $remotePeerAlreadyConnected, seen_table_size = len(network.seenTable)
+
+    # Prevent (a purely theoretical) high CPU usage when losing connectivity.
+    await sleepAsync(1.seconds)

 proc runDiscoveryLoop*(node: Eth2Node) {.async.} =
   debug "Starting discovery loop"

@@ -1130,17 +1138,19 @@ proc announcedENR*(node: Eth2Node): enr.Record =
 proc shortForm*(id: KeyPair): string =
   $PeerID.init(id.pubkey)

+let BOOTSTRAP_NODE_CHECK_INTERVAL = 30.seconds

Contributor:

This is a constant and should be placed at the beginning of the file, with a comment saying where it is used.

Contributor Author:

The further it is from the place it's used, the harder it is to find. Jumping all over the file just to understand a few lines of code makes our job harder.

+proc checkIfConnectedToBootstrapNode(p: pointer) {.gcsafe.} =

Contributor (@cheatfate), Jul 23, 2020:

This procedure looks very ugly, but it could look like this:

  proc checkIfConnected(node: Eth2Node) {.async.} =
    while true:
      await sleepAsync(30.seconds)
      if node.discovery.bootstrapRecords.len > 0 and len(node.peerPool) == 0:
        warn "Failed to connect to any node", bootstrapEnrs = node.discovery.bootstrapRecords

  traceAsyncErrors checkIfConnected(node)

But you should also not use metric values for program logic; it is better to check the PeerPool for the number of connections currently available.

Contributor Author:

> you should not use metric values for program logic

Why not? Keeps the number of variables low.

Contributor:

There is already a place which performs tracking of libp2p peers.

Member:

> Why not? Keeps the number of variables low.

Metrics are a one-way data flow out of the application: they're an optional feature, and we should be able to compile the application with them completely disabled at compile time. Using them as regular variables makes the code harder to analyze, as they now start to serve multiple orthogonal concerns.
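As a concrete contrast, here is a minimal sketch of the two checks being discussed. It reuses nbc_successful_dials and successfullyDialledAPeer from this diff and assumes a node: Eth2Node in scope; it is an illustration, not code proposed in the PR:

  # Check built on the metrics counter - ties program logic to the optional
  # metrics subsystem:
  if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
    warn "Failed to connect to any bootstrap node",
      bootstrapEnrs = node.discovery.bootstrapRecords

  # Check built on a dedicated boolean - ordinary program state that works
  # even when metrics are compiled out (the direction the PR ends up taking):
  if node.discovery.bootstrapRecords.len > 0 and not successfullyDialledAPeer:
    warn "Failed to connect to any bootstrap node",
      bootstrapEnrs = node.discovery.bootstrapRecords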

Contributor Author:

Your version might look better to you, but it does the wrong thing by continuing to check for that condition after it became false. You fix that and it becomes uglier than my version. Furthermore, you dropped the part of its name that made clear what it actually does.

Also, do you remember what traceAsyncErrors() does without reading its code? I don't. Its name sounds like something that's currently broken in Chronos.
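To make that concrete, a minimal sketch (not code from this PR) of the extra exit condition the looping variant would need in order to stop checking once the warning can no longer apply; it reuses the successfullyDialledAPeer flag and BOOTSTRAP_NODE_CHECK_INTERVAL constant introduced elsewhere in this diff:

  proc checkIfConnectedToBootstrapNode(node: Eth2Node) {.async.} =
    # Stop looping (and warning) once at least one peer has been dialled.
    while not successfullyDialledAPeer:
      await sleepAsync(BOOTSTRAP_NODE_CHECK_INTERVAL)
      if node.discovery.bootstrapRecords.len > 0 and not successfullyDialledAPeer:
        warn "Failed to connect to any bootstrap node",
          bootstrapEnrs = node.discovery.bootstrapRecords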

Contributor:

@stefantalpalaru please rename your PR title, because the PR content is absolutely different from the PR title.

You should name it: "deal with temporary loss of network connectivity while node is starting".

Contributor:

> BTW, for what you want, "Failed to connect to any node" is the wrong message, because it implies historical data which you don't have. You need "Not connected to any peer right now. There may be a problem with your network connectivity."

I'm not the author of this PR; I made a proposal. If you do not like the message, you can easily change it.

Contributor:

> Exactly. I want that warning to only appear before successfully connecting to a peer. Detecting any other problem is a separate concern better addressed in a separate procedure, in a separate PR.

Your PR could easily fix the issue for both cases - "network loss on node start" and "network loss while the node is working" - and this can be done by changing 2 LOCs... So why not introduce such changes in this PR?

Contributor Author:

> create a dedicated boolean for it then

Done.

> You should name it: deal with temporary loss of network connectivity while node is starting.

But reducing those two timeouts is useful all the time, not just at program start.

> traceAsyncErrors catching exceptions in absolutely legal way https://github.com/status-im/nim-eth/blob/master/eth/async_utils.nim#L11

The logic inside catchOrQuit() is wrong right now: https://github.com/status-im/nim-eth/blob/765883c454be726799f4e724b4dc2ca8fe25bc74/eth/async_utils.nim#L7

Defects don't end up in Future.error, but are raised directly, so that else branch is unreachable.
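For context, a rough sketch of the shape of the catchOrQuit() helper being discussed (a paraphrase of the linked nim-eth code, not the verbatim source); the argument above is that the non-CatchableError branch cannot be reached through Future.error, because a Defect raised in an async proc propagates immediately instead of being stored in the future:

  proc catchOrQuit(error: ref Exception) =
    if error of CatchableError:
      trace "Async operation ended with a recoverable error", err = error.msg
    else:
      # Claimed unreachable when called on Future.error, since Defects are
      # raised directly rather than captured by the future.
      fatal "Fatal exception reached", err = error.msg
      quit 1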

Contributor Author:

> Your PR could easily fix the issue for both cases - "network loss on node start", "network loss while node is working". And this can be done by changing 2 LOCs...

It might get more complicated than that. Before telling the average user that his network might be down (or there's a problem with the firewall), I should look at any other network traffic I might have available - stuff like Discovery v5 nodes or any UDP multicasting we might be doing and getting replies from.

I should also have more reliable ways of establishing if the problem is with the Eth2 network itself (and I should probably ping some servers with high uptime before making a fool of myself by lying to the user).

+  # Keep showing warnings until we connect to at least one bootstrap node
+  # successfully, in order to allow detection of an invalid configuration.
+  let node = cast[Eth2Node](p)
+  if node.discovery.bootstrapRecords.len > 0 and not successfullyDialledAPeer:
+    warn "Failed to connect to any bootstrap node",
+      bootstrapEnrs = node.discovery.bootstrapRecords
+  addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, p)

 proc startLookingForPeers*(node: Eth2Node) {.async.} =
   await node.start()

-  proc checkIfConnectedToBootstrapNode {.async.} =
-    await sleepAsync(30.seconds)
-    if node.discovery.bootstrapRecords.len > 0 and nbc_successful_dials.value == 0:
-      fatal "Failed to connect to any bootstrap node. Quitting",
-        bootstrapEnrs = node.discovery.bootstrapRecords
-      quit 1
-
-  traceAsyncErrors checkIfConnectedToBootstrapNode()
+  addTimer(BOOTSTRAP_NODE_CHECK_INTERVAL, checkIfConnectedToBootstrapNode, node[].addr)

 func peersCount*(node: Eth2Node): int =
   len(node.peerPool)
scripts/launch_local_testnet.sh (2 changes: 1 addition & 1 deletion)

@@ -147,7 +147,7 @@ $MAKE LOG_LEVEL="${LOG_LEVEL}" NIMFLAGS="-d:insecure -d:testnet_servers_image ${
 PIDS=""
 WEB3_ARG=""
 STATE_SNAPSHOT_ARG=""
-BOOTSTRAP_TIMEOUT=10 # in seconds
+BOOTSTRAP_TIMEOUT=30 # in seconds
 DEPOSIT_CONTRACT_ADDRESS="0x0000000000000000000000000000000000000000"
 DEPOSIT_CONTRACT_BLOCK="0x0000000000000000000000000000000000000000000000000000000000000000"
 NETWORK_METADATA_FILE="${DATA_DIR}/network.json"