fix: fix some issues in zero downtime upgrade
bowenwang1996 committed Jun 12, 2020
1 parent a6c69f0 commit 03853e0
Showing 6 changed files with 89 additions and 20 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

7 changes: 2 additions & 5 deletions chain/client/src/client.rs
@@ -316,11 +316,8 @@ impl Client {
&next_block_proposer,
)?;
if validator_stake.public_key != validator_signer.public_key() {
return Err(Error::BlockProducer(format!(
"Validator key doesn't match. Expected {} Actual {}",
validator_stake.public_key,
validator_signer.public_key()
)));
debug!(target: "client", "Local validator key {} does not match expected validator key {}, skipping block production", validator_signer.public_key(), validator_stake.public_key);
return Ok(None);
}

debug!(target: "client", "{:?} Producing block at height {}, parent {} @ {}", validator_signer.validator_id(), next_height, prev.height(), format_hash(head.last_block_hash));
23 changes: 11 additions & 12 deletions chain/client/src/client_actor.rs
@@ -590,11 +590,10 @@ impl ClientActor {
}

// First check that we currently have an AccountId
if self.client.validator_signer.is_none() {
// There is no account id associated with this client
return;
}
let validator_signer = self.client.validator_signer.as_ref().unwrap();
let validator_signer = match self.client.validator_signer.as_ref() {
None => return,
Some(signer) => signer,
};

let now = Instant::now();
// Check that we haven't announced it too recently
@@ -615,14 +614,14 @@
.get_next_epoch_id_from_prev_block(&prev_block_hash));

// Check client is part of the future validators
if let Ok(validators) = self
.client
.runtime_adapter
.get_epoch_block_producers_ordered(&next_epoch_id, &prev_block_hash)
if let Ok((validator_stake, is_slashed)) =
self.client.runtime_adapter.get_validator_by_account_id(
&next_epoch_id,
&prev_block_hash,
validator_signer.validator_id(),
)
{
if validators.iter().any(|(validator_stake, _)| {
&validator_stake.account_id == validator_signer.validator_id()
}) {
if !is_slashed && validator_stake.public_key == validator_signer.public_key() {
debug!(target: "client", "Sending announce account for {}", validator_signer.validator_id());
self.last_validator_announce_time = Some(now);
let signature = self.sign_announce_account(&next_epoch_id).unwrap();
12 changes: 10 additions & 2 deletions neard/src/config.rs
@@ -171,6 +171,10 @@ fn default_peer_recent_time_window() -> Duration {
fn default_safe_set_size() -> u32 {
20
}
/// Time to keep account ids in the router before removing them, in seconds.
fn default_ttl_account_id_router() -> Duration {
Duration::from_secs(TTL_ACCOUNT_ID_ROUTER)
}

#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct Network {
@@ -212,6 +216,9 @@ pub struct Network {
/// It can be IP:Port or IP (to blacklist all connections coming from this address).
#[serde(default)]
pub blacklist: Vec<String>,
/// Time to keep account ids in the router before removing them, in seconds.
#[serde(default = "default_ttl_account_id_router")]
pub ttl_account_id_router: Duration,
}

impl Default for Network {
@@ -231,6 +238,7 @@
skip_sync_wait: false,
ban_window: Duration::from_secs(3 * 60 * 60),
blacklist: vec![],
ttl_account_id_router: default_ttl_account_id_router(),
}
}
}
@@ -527,7 +535,7 @@ impl NearConfig {
epoch_length: genesis.config.epoch_length,
num_block_producer_seats: genesis.config.num_block_producer_seats,
announce_account_horizon: genesis.config.epoch_length / 2,
ttl_account_id_router: Duration::from_secs(TTL_ACCOUNT_ID_ROUTER),
ttl_account_id_router: config.network.ttl_account_id_router,
// TODO(1047): this should be adjusted depending on the speed of sync of state.
block_fetch_horizon: config.consensus.block_fetch_horizon,
state_fetch_horizon: config.consensus.state_fetch_horizon,
@@ -571,7 +579,7 @@ impl NearConfig {
max_send_peers: 512,
peer_expiration_duration: Duration::from_secs(7 * 24 * 60 * 60),
peer_stats_period: Duration::from_secs(5),
ttl_account_id_router: Duration::from_secs(TTL_ACCOUNT_ID_ROUTER),
ttl_account_id_router: config.network.ttl_account_id_router,
routed_message_ttl: ROUTED_MESSAGE_TTL,
max_routes_to_store: MAX_ROUTES_TO_STORE,
highest_peer_horizon: HIGHEST_PEER_HORIZON,
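A note on the new setting: ttl_account_id_router is read from the network section of a node's config.json, and the Duration is serialized as an object with secs and nanos fields. Below is a minimal sketch of overriding it per node through the pytest harness, mirroring the client_config used by the new test further down (node indices, the 100 ms TTL, and the cluster arguments are adapted from that test):

from cluster import start_cluster

# Duration is serialized as {"secs": ..., "nanos": ...}; this sets a 100 ms TTL
client_config = {
    "network": {"ttl_account_id_router": {"secs": 0, "nanos": 100000000}}
}

# Override the TTL on nodes 1 and 2 only; node 0 keeps the built-in default
nodes = start_cluster(2, 1, 1, None,
                      [["epoch_length", 10]],
                      {1: client_config, 2: client_config})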
17 changes: 16 additions & 1 deletion pytest/lib/cluster.py
@@ -71,6 +71,9 @@ def from_json_file(self, jf):
with open(jf) as f:
return Key.from_json(json.loads(f.read()))

def to_json(self):
return {'account_id': self.account_id, 'public_key': self.pk, 'secret_key': self.sk}


class BaseNode(object):

@@ -330,6 +333,11 @@ def kill(self):
def reset_data(self):
shutil.rmtree(os.path.join(self.node_dir, "data"))

def reset_validator_key(self, new_key):
self.validator_key = new_key
with open(os.path.join(self.node_dir, "validator_key.json"), 'w+') as f:
json.dump(new_key.to_json(), f)

def cleanup(self):
if self.cleaned:
return
@@ -467,6 +475,13 @@ def resume_network(self):
rc.run(f'gcloud compute firewall-rules delete {self.machine.name}-stop',
input='yes\n')

def reset_validator_key(self, new_key):
self.validator_key = new_key
with open(os.path.join(self.node_dir, "validator_key.json"), 'w+') as f:
json.dump(new_key.to_json(), f)
self.machine.upload(os.path.join(self.node_dir, 'validator_key.json'),
f'/home/{self.machine.username}/.near/')


def spin_up_node(config,
near_root,
@@ -598,7 +613,7 @@ def apply_config_changes(node_dir, client_config_change):
assert k in config_json
if isinstance(v, dict):
for key, value in v.items():
assert key in config_json[k]
assert key in config_json[k], key
config_json[k][key] = value
else:
config_json[k] = v
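Together, the new Key.to_json and reset_validator_key helpers let a test swap a node's validator key in place: stop the node, rewrite validator_key.json with a key for a different account id, wipe the data directory, and restart. A condensed sketch of that sequence, following the steps of the new sanity test below (the nodes list comes from start_cluster as in the sketch above):

from cluster import Key

node = nodes[2]
node.kill()
# Reuse node 2's signer key pair under node 1's validator account id
new_key = Key(nodes[1].validator_key.account_id,
              nodes[2].signer_key.pk, nodes[2].signer_key.sk)
node.reset_validator_key(new_key)  # rewrites validator_key.json on disk
node.reset_data()                  # clears state so the node re-syncs from scratch
node.start(nodes[0].node_key.pk, nodes[0].addr())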
49 changes: 49 additions & 0 deletions pytest/tests/sanity/validator_switch_key.py
@@ -0,0 +1,49 @@
# Starts two validating nodes and one non-validating node.
# Sets a new validator key on the non-validating node that has the same
# account id as one of the validating nodes, stakes that account with the
# new key, and makes sure that the network doesn't stall even after
# the non-validating node becomes a validator.

import sys, time, base58

sys.path.append('lib')

from cluster import start_cluster, Key
from transaction import sign_staking_tx

EPOCH_LENGTH = 10
TIMEOUT = 60

client_config = {"network": {"ttl_account_id_router": {"secs": 0, "nanos": 100000000}}}
nodes = start_cluster(2, 1, 1, None, [["epoch_length", EPOCH_LENGTH], ["block_producer_kickout_threshold", 10],
["chunk_producer_kickout_threshold", 10]], {1: client_config, 2: client_config})
time.sleep(2)

nodes[2].kill()

validator_key = Key(nodes[1].validator_key.account_id, nodes[2].signer_key.pk, nodes[2].signer_key.sk)
nodes[2].reset_validator_key(validator_key)
nodes[2].reset_data()
nodes[2].start(nodes[0].node_key.pk, nodes[0].addr())
time.sleep(3)

status = nodes[0].get_status()
block_hash = status['sync_info']['latest_block_hash']
block_height = status['sync_info']['latest_block_height']

tx = sign_staking_tx(nodes[1].signer_key, validator_key, 50000000000000000000000000000000, 1,
base58.b58decode(block_hash.encode('utf8')))
res = nodes[0].send_tx_and_wait(tx, timeout=15)
assert 'error' not in res

start_time = time.time()
while True:
if time.time() - start_time > TIMEOUT:
assert False, "Validators got stuck"
status1 = nodes[1].get_status()
node1_height = status1['sync_info']['latest_block_height']
status2 = nodes[2].get_status()
node2_height = status2['sync_info']['latest_block_height']
if node1_height > block_height + 4 * EPOCH_LENGTH and node2_height > block_height + 4 * EPOCH_LENGTH:
break
time.sleep(2)
