-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use peermanager scores for blocksync peers and don't error out on block mismatch #162
Changes from 11 commits
5786063
1ac69bc
fe9f6e0
08f8163
2f4517f
bfc637c
533f759
8fa8638
1137ac4
2bc2fbb
3610170
a1bd259
a83139b
ae4b8bc
e398b04
3f06493
a70ffd2
e47270c
36d31f9
ecf6e7f
a149386
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,10 @@ import ( | |
"context" | ||
"errors" | ||
"fmt" | ||
"github.com/tendermint/tendermint/internal/p2p" | ||
"math" | ||
"math/rand" | ||
"sort" | ||
"sync" | ||
"sync/atomic" | ||
"time" | ||
|
@@ -47,7 +50,7 @@ const ( | |
maxDiffBetweenCurrentAndReceivedBlockHeight = 100 | ||
) | ||
|
||
var peerTimeout = 15 * time.Second // not const so we can override with tests | ||
var peerTimeout = 3 * time.Second // not const so we can override with tests | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Waiting 15s for the timeout is too long, given that we expect blocks much more frequently. |
||
|
||
/* | ||
Peers self report their heights when we join the block pool. | ||
|
@@ -80,6 +83,7 @@ type BlockPool struct { | |
height int64 // the lowest key in requesters. | ||
// peers | ||
peers map[types.NodeID]*bpPeer | ||
peerManager *p2p.PeerManager | ||
maxPeerHeight int64 // the biggest reported height | ||
|
||
// atomic | ||
|
@@ -101,8 +105,8 @@ func NewBlockPool( | |
start int64, | ||
requestsCh chan<- BlockRequest, | ||
errorsCh chan<- peerError, | ||
peerManager *p2p.PeerManager, | ||
) *BlockPool { | ||
|
||
bp := &BlockPool{ | ||
logger: logger, | ||
peers: make(map[types.NodeID]*bpPeer), | ||
|
@@ -113,6 +117,7 @@ func NewBlockPool( | |
requestsCh: requestsCh, | ||
errorsCh: errorsCh, | ||
lastSyncRate: 0, | ||
peerManager: peerManager, | ||
} | ||
bp.BaseService = *service.NewBaseService(logger, "BlockPool", bp) | ||
return bp | ||
|
@@ -315,7 +320,9 @@ func (pool *BlockPool) AddBlock(peerID types.NodeID, block *types.Block, extComm | |
} | ||
} else { | ||
err := errors.New("requester is different or block already exists") | ||
pool.sendError(err, peerID) | ||
// Original behavior is to error out when there is a mismatch, which shuts down the entire reactor. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Overall LGTM, one question: do we understand why the original behavior errors out here and shuts down the entire reactor? Will there be any side effects if we change this behavior? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's revert this for now. When the standalone RPC node falls behind again (it is currently healthy and was upgraded to the new version), we can re-apply this patch and see if it helps. |
||
// Instead, make the reactor more robust and just log error | ||
//pool.sendError(err, peerID) | ||
return fmt.Errorf("%w (peer: %s, requester: %s, block height: %d)", err, peerID, requester.getPeerID(), block.Height) | ||
} | ||
|
||
|
@@ -408,13 +415,44 @@ func (pool *BlockPool) updateMaxPeerHeight() { | |
pool.maxPeerHeight = max | ||
} | ||
|
||
func (pool *BlockPool) getSortedPeers(peers map[types.NodeID]*bpPeer) []types.NodeID { | ||
// Generate a sorted list | ||
sortedPeers := make([]types.NodeID, 0, len(peers)) | ||
|
||
for peer := range peers { | ||
sortedPeers = append(sortedPeers, peer) | ||
} | ||
// Sort from high to low score | ||
sort.Slice(sortedPeers, func(i, j int) bool { | ||
return pool.peerManager.Score(sortedPeers[i]) > pool.peerManager.Score(sortedPeers[j]) | ||
}) | ||
return sortedPeers | ||
} | ||
|
||
// Pick an available peer with the given height available. | ||
// If no peers are available, returns nil. | ||
func (pool *BlockPool) pickIncrAvailablePeer(height int64) *bpPeer { | ||
pool.mtx.Lock() | ||
defer pool.mtx.Unlock() | ||
|
||
for _, peer := range pool.peers { | ||
// Generate a sorted list | ||
sortedPeers := pool.getSortedPeers(pool.peers) | ||
var goodPeers []types.NodeID | ||
// Remove peers with 0 score and shuffle list | ||
for _, peer := range sortedPeers { | ||
// We only want to work with peers that are ready & connected (not dialing) | ||
if pool.peerManager.State(peer) == "ready,connected" { | ||
goodPeers = append(goodPeers, peer) | ||
} | ||
if pool.peerManager.Score(peer) == 0 { | ||
break | ||
} | ||
} | ||
rand.Seed(time.Now().UnixNano()) | ||
rand.Shuffle(len(goodPeers), func(i, j int) { goodPeers[i], goodPeers[j] = goodPeers[j], goodPeers[i] }) | ||
|
||
for _, nodeId := range sortedPeers { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A few further optimizations we can do:
|
||
peer := pool.peers[nodeId] | ||
if peer.didTimeout { | ||
pool.removePeer(peer.id) | ||
continue | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: fix lint issue