…#3036)

After this change stress.py node_restart passes relatively consistently, and is reintroduced to nightly.

Nearcore fixes:
- We had a bug in the syncing logic (with a low chance of being triggered in the wild): if a block is produced and between 1/3 and 2/3 of block producers receive it while the rest do not, the system stalls, because no 2/3 of block producers share the same head, but nobody is two blocks behind the highest peer either, so nobody starts syncing. Fixed by forcing sync if we have been one block behind for too long. stress.py reproduced this issue in every run.
- (#2916) If a node produced a chunk and then crashed, on recovery it was unable to serve that chunk, because it did not have all the parts and receipts stored in the storage from which we recover cache entries in the shards manager. Fixed by always storing all the parts and receipts (redundantly) for chunks in the shards we care about.

Test fixes:
- [v] Fixing a scenario in which a failure to send a transaction to all validators resulted in recording an incorrect tx hash alongside the tx. Later, checking balances using the incorrect hash returned an incorrect success value, and thus applied incorrect corrections to the expected balances.
- [v] Changing the order of magnitude of staking transactions, so that the validator set actually changes.

Other issues discovered while fixing stress.py:
- #2906
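To make the first nearcore fix concrete, here is a minimal sketch of the forced-sync rule in Python. The names (SyncDecider, should_sync, STALL_TIMEOUT_SEC) and the timeout value are hypothetical, chosen for illustration only; they are not nearcore's actual types or constants, and the real logic lives in the Rust client.

import time

# Hypothetical constant: how long we tolerate being exactly one block behind
# the highest peer before forcing sync anyway.
STALL_TIMEOUT_SEC = 10.0

class SyncDecider:
    """Illustrative sketch of the stall-avoidance rule described above."""

    def __init__(self):
        self.behind_since = None  # when we first observed being 1 block behind

    def should_sync(self, our_height: int, highest_peer_height: int) -> bool:
        # Original rule: only sync when we are 2+ blocks behind the highest peer.
        if highest_peer_height >= our_height + 2:
            self.behind_since = None
            return True

        # The fix: if we sit exactly 1 block behind for too long (the 1/3-2/3
        # stall described in the commit message), force sync instead of
        # waiting forever.
        if highest_peer_height == our_height + 1:
            if self.behind_since is None:
                self.behind_since = time.time()
            return time.time() - self.behind_since > STALL_TIMEOUT_SEC

        # Caught up (or ahead): reset the timer.
        self.behind_since = None
        return False

The point is only the shape of the rule: the two-blocks-behind trigger stays, and a timer turns a persistent one-block lag into a sync trigger, so the stall described above resolves itself.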
1 parent 2da194e · commit 062dce7
Showing 10 changed files with 351 additions and 86 deletions.
@@ -0,0 +1,108 @@
# Spins up two nodes with two shards, waits for a couple of blocks, snapshots
# the latest chunks, and requests both chunks from the first node, asking for
# receipts for both shards in both requests. We expect the first node to
# respond to exactly one of the requests, for the shard it tracks (for the
# shard it doesn't track it will only have the receipts to the shard it does
# track).
#
# We then kill both nodes, restart the first node, and issue the same
# requests. We expect it to respond the same way. Before #2916 is fixed, it
# fails to respond to the request it was previously responding to due to
# incorrect reconstruction of the receipts.

import asyncio, sys, time
import socket, base58
import nacl.signing, hashlib

sys.path.append('lib')

from cluster import start_cluster
from peer import *
from utils import obj_to_string

from messages.tx import *
from messages.block import *
from messages.crypto import *
from messages.network import *


async def main():
    # start a cluster with two nodes and two shards
    nodes = start_cluster(2, 0, 2, None, [], {})

    started = time.time()

    # wait for a few blocks to be produced, then snapshot the chunk hashes of
    # the latest block
    while True:
        if time.time() - started > 10:
            assert False, "Giving up waiting for two blocks"

        status = nodes[0].get_status()
        hash_ = status['sync_info']['latest_block_hash']
        height = status['sync_info']['latest_block_height']

        if height > 2:
            block = nodes[0].get_block(hash_)
            chunk_hashes = [base58.b58decode(x['chunk_hash']) for x in block['result']['chunks']]

            assert len(chunk_hashes) == 2
            assert all([len(x) == 32 for x in chunk_hashes])

            break

    my_key_pair_nacl = nacl.signing.SigningKey.generate()
    received_responses = [None, None]

    # step = 0: before the node is killed
    # step = 1: after the node is killed
    for step in range(2):

        # open a raw peer connection to the first node and perform the handshake
        conn0 = await connect(nodes[0].addr())
        await run_handshake(conn0, nodes[0].node_key.pk, my_key_pair_nacl)
        for shard_ord, chunk_hash in enumerate(chunk_hashes):

            # request the chunk, asking for receipts for both shards
            request = PartialEncodedChunkRequestMsg()
            request.chunk_hash = chunk_hash
            request.part_ords = []
            request.tracking_shards = [0, 1]

            routed_msg_body = RoutedMessageBody()
            routed_msg_body.enum = 'PartialEncodedChunkRequest'
            routed_msg_body.PartialEncodedChunkRequest = request

            peer_message = create_and_sign_routed_peer_message(routed_msg_body, nodes[0], my_key_pair_nacl)

            await conn0.send(peer_message)

            received_response = False

            def predicate(response):
                return response.enum == 'Routed' and response.Routed.body.enum == 'PartialEncodedChunkResponse'

            try:
                response = await asyncio.wait_for(conn0.recv(predicate), 5)
            except asyncio.TimeoutError:
                # asyncio.wait_for raises asyncio.TimeoutError on timeout
                response = None

            if response is not None:
                print("Received response for shard %s" % shard_ord)
                received_response = True
            else:
                print("Didn't receive response for shard %s" % shard_ord)

            if step == 0:
                received_responses[shard_ord] = received_response
            else:
                assert received_responses[shard_ord] == received_response, "The response doesn't match for the chunk in shard %s. Received response before node killed: %s, after: %s" % (shard_ord, received_responses[shard_ord], received_response)

        # we expect the first node to respond to only one of the chunk requests,
        # for the shard assigned to it
        assert received_responses[0] != received_responses[1], received_responses

        if step == 0:
            print("Killing and restarting nodes")
            nodes[1].kill()
            nodes[0].kill()
            nodes[0].start(None, None)
            time.sleep(1)


asyncio.run(main())