-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
*: fix node bootstrap not idempotent #1774
Changes from 5 commits
785ab3e
ab87922
d419e73
9983b4d
067c8db
dab9955
e5e5587
19fa048
4767f7a
ae05692
5409f36
75bad78
8df4bcf
e6aefcc
a1df814
f0aa06e
942ac5b
654b59d
a79653c
08f9442
10c0282
f21d07d
df6f5ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -116,11 +116,11 @@ impl<C> Node<C> | |
} | ||
|
||
self.store.set_id(store_id); | ||
|
||
try!(self.check_prepare_bootstrap_cluster(&engine)); | ||
if !bootstrapped { | ||
// cluster is not bootstrapped, and we choose first store to bootstrap | ||
// first region. | ||
let region = try!(self.bootstrap_first_region(&engine, store_id)); | ||
// prepare bootstrap. | ||
let region = try!(self.prepare_bootstrap_cluster(&engine, store_id)); | ||
try!(self.bootstrap_cluster(&engine, region)); | ||
} | ||
|
||
|
@@ -181,7 +181,7 @@ impl<C> Node<C> | |
Ok(store_id) | ||
} | ||
|
||
fn bootstrap_first_region(&self, engine: &DB, store_id: u64) -> Result<metapb::Region> { | ||
fn prepare_bootstrap_cluster(&self, engine: &DB, store_id: u64) -> Result<metapb::Region> { | ||
let region_id = try!(self.alloc_id()); | ||
info!("alloc first region id {} for cluster {}, store {}", | ||
region_id, | ||
|
@@ -192,21 +192,66 @@ impl<C> Node<C> | |
peer_id, | ||
region_id); | ||
|
||
let region = try!(store::bootstrap_region(engine, store_id, region_id, peer_id)); | ||
let region = try!(store::prepare_bootstrap(engine, store_id, region_id, peer_id)); | ||
Ok(region) | ||
} | ||
|
||
fn check_region_epoch(&self, region: &metapb::Region, other: &metapb::Region) -> Result<()> { | ||
let epoch = region.get_region_epoch(); | ||
let other_epoch = other.get_region_epoch(); | ||
if epoch.get_conf_ver() != other_epoch.get_conf_ver() { | ||
return Err(box_err!("region conf_ver inconsist: {} with {}", | ||
epoch.get_conf_ver(), | ||
other_epoch.get_conf_ver())); | ||
} | ||
if epoch.get_version() != other_epoch.get_version() { | ||
return Err(box_err!("region version inconsist: {} with {}", | ||
epoch.get_version(), | ||
other_epoch.get_version())); | ||
} | ||
Ok(()) | ||
} | ||
|
||
fn check_prepare_bootstrap_cluster(&self, engine: &DB) -> Result<()> { | ||
let res = try!(engine.get_msg::<metapb::Region>(&keys::prepare_bootstrap_key())); | ||
if res.is_none() { | ||
return Ok(()); | ||
} | ||
|
||
let first_region = res.unwrap(); | ||
for _ in 0..MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { | ||
match self.pd_client.get_region(b"") { | ||
Ok(region) => { | ||
if region.get_id() == first_region.get_id() { | ||
try!(self.check_region_epoch(®ion, &first_region)); | ||
try!(store::clear_prepare_bootstrap_state(engine)); | ||
} else { | ||
try!(store::clear_prepare_bootstrap(engine, region.get_id())); | ||
} | ||
return Ok(()); | ||
} | ||
|
||
Err(e) => { | ||
warn!("check cluster prepare bootstrapped failed: {:?}", e); | ||
} | ||
} | ||
thread::sleep(Duration::from_secs(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_SECONDS)); | ||
} | ||
Err(box_err!("check cluster prepare bootstrapped failed")) | ||
} | ||
|
||
fn bootstrap_cluster(&mut self, engine: &DB, region: metapb::Region) -> Result<()> { | ||
let region_id = region.get_id(); | ||
match self.pd_client.bootstrap_cluster(self.store.clone(), region) { | ||
Err(PdError::ClusterBootstrapped(_)) => { | ||
error!("cluster {} is already bootstrapped", self.cluster_id); | ||
try!(store::clear_region(engine, region_id)); | ||
try!(store::clear_prepare_bootstrap(engine, region_id)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess you want to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you are right |
||
Ok(()) | ||
} | ||
// TODO: should we clean region for other errors too? | ||
Err(e) => panic!("bootstrap cluster {} err: {:?}", self.cluster_id, e), | ||
Ok(_) => { | ||
try!(store::clear_prepare_bootstrap_state(engine)); | ||
info!("bootstrap cluster {} ok", self.cluster_id); | ||
Ok(()) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,3 +34,4 @@ mod test_snap; | |
mod test_region_heartbeat; | ||
mod test_stale_peer; | ||
mod test_lease_read; | ||
mod test_bootstrap; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ struct Cluster { | |
|
||
down_peers: HashMap<u64, pdpb::PeerStats>, | ||
pending_peers: HashMap<u64, metapb::Peer>, | ||
is_bootstraped: bool, | ||
} | ||
|
||
impl Cluster { | ||
|
@@ -76,6 +77,7 @@ impl Cluster { | |
split_count: 0, | ||
down_peers: HashMap::new(), | ||
pending_peers: HashMap::new(), | ||
is_bootstraped: false, | ||
} | ||
} | ||
|
||
|
@@ -96,6 +98,11 @@ impl Cluster { | |
self.stores.insert(store_id, s); | ||
|
||
self.add_region(®ion); | ||
self.is_bootstraped = true; | ||
} | ||
|
||
fn set_bootstrap(&mut self, is_bootstraped: bool) { | ||
self.is_bootstraped = is_bootstraped | ||
} | ||
|
||
// We don't care cluster id here, so any value like 0 in tests is ok. | ||
|
@@ -131,6 +138,10 @@ impl Cluster { | |
self.stores.values().map(|s| s.store.clone()).collect() | ||
} | ||
|
||
fn get_regions_number(&self) -> Result<u32> { | ||
Ok(self.regions.len() as u32) | ||
} | ||
|
||
fn add_region(&mut self, region: &metapb::Region) { | ||
let end_key = enc_end_key(region); | ||
assert!(self.regions.insert(end_key.clone(), region.clone()).is_none()); | ||
|
@@ -362,6 +373,10 @@ impl TestPdClient { | |
Ok(()) | ||
} | ||
|
||
fn is_regions_empty(&self) -> Result<(bool)> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need Result here? |
||
Ok(self.cluster.rl().regions.is_empty()) | ||
} | ||
|
||
// Set a customized rule to overwrite default max peer count check rule. | ||
pub fn set_rule(&self, rule: Rule) { | ||
self.cluster.wl().rule = Some(rule); | ||
|
@@ -375,7 +390,9 @@ impl TestPdClient { | |
pub fn get_region_epoch(&self, region_id: u64) -> metapb::RegionEpoch { | ||
self.get_region_by_id(region_id).wait().unwrap().unwrap().take_region_epoch() | ||
} | ||
|
||
pub fn get_regions_number(&self) -> Result<(u32)> { | ||
self.cluster.rl().get_regions_number() | ||
} | ||
// Set an empty rule which nothing to do to disable default max peer count | ||
// check rule, we can use reset_rule to enable default again. | ||
pub fn disable_default_rule(&self) { | ||
|
@@ -423,6 +440,9 @@ impl TestPdClient { | |
.unwrap(); | ||
panic!("region {:?} has peer {:?}", region, peer); | ||
} | ||
pub fn add_region(&self, region: &metapb::Region) { | ||
self.cluster.wl().add_region(region) | ||
} | ||
|
||
pub fn add_peer(&self, region_id: u64, peer: metapb::Peer) { | ||
self.set_rule(box move |region: &metapb::Region, _: &metapb::Peer| { | ||
|
@@ -503,7 +523,8 @@ impl PdClient for TestPdClient { | |
} | ||
|
||
fn bootstrap_cluster(&self, store: metapb::Store, region: metapb::Region) -> Result<()> { | ||
if self.is_cluster_bootstrapped().unwrap() { | ||
if self.is_cluster_bootstrapped().unwrap() || !self.is_regions_empty().unwrap() { | ||
self.cluster.wl().set_bootstrap(true); | ||
return Err(Error::ClusterBootstrapped(self.cluster_id)); | ||
} | ||
|
||
|
@@ -513,7 +534,7 @@ impl PdClient for TestPdClient { | |
} | ||
|
||
fn is_cluster_bootstrapped(&self) -> Result<bool> { | ||
Ok(!self.cluster.rl().stores.is_empty()) | ||
Ok(self.cluster.rl().is_bootstraped) | ||
} | ||
|
||
fn alloc_id(&self) -> Result<u64> { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// Copyright 2017 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use super::cluster::{Cluster, Simulator}; | ||
use super::node::new_node_cluster; | ||
use super::util::*; | ||
|
||
fn test_bootstrap_idempotent<T: Simulator>(cluster: &mut Cluster<T>) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we check conf ver and version inconsistent for check_region_epoch? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any update? @nolouch |
||
// assume that there is a node bootstrap the cluster and add region in pd successfully | ||
cluster.add_first_region().unwrap(); | ||
// now at same time start the another node, and will recive cluster is not bootstrap | ||
// try to bootstrap with a new region | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to test both when pd is bootstrapped and pd is not bootstrapped. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I start the cluster two times, in the second time the cluster is bootstrapped. |
||
cluster.start(); | ||
cluster.check_regions_number(1); | ||
cluster.shutdown(); | ||
sleep_ms(500); | ||
cluster.start(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. start twice to check what? I think we must check the prepare bootstrap key in RocksDB too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. any update? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. first the cluster is not bootstraped. second is bootstraped. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have checked it in here https://github.com/pingcap/tikv/pull/1774/files#diff-7d211b12fe1e345db17e9dbfb322abd3R186 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, not the same. |
||
cluster.check_regions_number(1); | ||
} | ||
|
||
#[test] | ||
fn test_node_bootstrap_idempotent() { | ||
let mut cluster = new_node_cluster(0, 3); | ||
test_bootstrap_idempotent(&mut cluster); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a blank line